diff --git a/README.md b/README.md
index 3f94c149a..70346f3d3 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ We provide reference implementations for each of the 7 benchmarks in the MLPerf
 * image_classification - Resnet-50 v1 applied to Imagenet.
 * object_detection - Mask R-CNN applied to COCO.
+* single_stage_detector - SSD applied to COCO 2017.
 * speech_recognition - DeepSpeech2 applied to Librispeech.
 * translation - Transformer applied to WMT English-German.
 * recommendation - Neural Collaborative Filtering applied to MovieLens 20 Million (ml-20m).
diff --git a/single_stage_detector/Dockerfile b/single_stage_detector/Dockerfile
new file mode 100755
index 000000000..af2632105
--- /dev/null
+++ b/single_stage_detector/Dockerfile
@@ -0,0 +1,20 @@
+FROM pytorch/pytorch:0.4_cuda9_cudnn7
+
+# Set working directory
+WORKDIR /mlperf
+
+RUN apt-get update && \
+    apt-get install -y python3-tk python-pip
+
+# Necessary pip packages
+RUN pip install --upgrade pip
+RUN pip install Cython==0.28.4 \
+    matplotlib==2.2.2
+RUN python3 -m pip install pycocotools==2.0.0
+
+# Copy SSD code
+WORKDIR /mlperf
+COPY . .
+RUN pip install -r requirements.txt
+
+WORKDIR /mlperf/ssd
diff --git a/single_stage_detector/download_dataset.sh b/single_stage_detector/download_dataset.sh
new file mode 100755
index 000000000..3d0712519
--- /dev/null
+++ b/single_stage_detector/download_dataset.sh
@@ -0,0 +1,7 @@
+# Get COCO 2017 data sets
+dir=$(pwd)
+mkdir /coco; cd /coco
+curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip
+curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip
+curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip
+cd $dir
diff --git a/single_stage_detector/requirements.txt b/single_stage_detector/requirements.txt
new file mode 100755
index 000000000..131c6adea
--- /dev/null
+++ b/single_stage_detector/requirements.txt
@@ -0,0 +1,12 @@
+cycler==0.10.0
+kiwisolver==1.0.1
+matplotlib==2.2.2
+numpy==1.14.5
+Pillow==5.2.0
+pycocotools==2.0.0
+pyparsing==2.2.0
+python-dateutil==2.7.3
+pytz==2018.5
+six==1.11.0
+torch==0.4.0
+torchvision==0.2.1
diff --git a/single_stage_detector/ssd/README.md b/single_stage_detector/ssd/README.md
new file mode 100644
index 000000000..4066f45f4
--- /dev/null
+++ b/single_stage_detector/ssd/README.md
@@ -0,0 +1,71 @@
+# 1. Problem
+Object detection.
+
+# 2. Directions
+
+### Steps to configure machine
+From Source
+
+Standard script.
+
+From Docker
+1. Check out the MLPerf repository
+```
+git clone https://github.com/mlperf/reference.git
+```
+2. Install CUDA and Docker
+```
+source reference/install_cuda_docker.sh
+```
+3. Build the Docker image for the single-stage detection task
+```
+# Build from Dockerfile
+cd reference/single_stage_detector/
+sudo docker build -t mlperf/single_stage_detector .
+```
+
+### Steps to download data
+```
+cd reference/single_stage_detector/
+source download_dataset.sh
+```
+
+### Steps to run benchmark
+From Source
+
+Run the run_and_time.sh script
+```
+cd reference/single_stage_detector/ssd
+source run_and_time.sh SEED TARGET
+```
+where SEED is the random seed for a run and TARGET is the quality target from Section 5 below.
+
+Docker Image
+```
+sudo nvidia-docker run -v /coco:/coco -t -i --rm --ipc=host mlperf/single_stage_detector ./run_and_time.sh SEED TARGET
+```
+
+# 3. Dataset/Environment
+### Publication/Attribution
+Microsoft COCO: Common Objects in Context. 2017.
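+
+As an optional sanity check (not part of the benchmark timing), the annotation files can be loaded with `pycocotools` to confirm the download. This is a minimal sketch assuming the `/coco` layout created by `download_dataset.sh`:
+```
+from pycocotools.coco import COCO
+
+train = COCO("/coco/annotations/instances_train2017.json")
+val = COCO("/coco/annotations/instances_val2017.json")
+print(len(train.getImgIds()), "training images")    # expect 118287
+print(len(val.getImgIds()), "validation images")    # expect 5000
+```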
+ +### Training and test data separation +Train on 2017 COCO train data set, compute mAP on 2017 COCO val data set. + +# 4. Model. +### Publication/Attribution +Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector. In the Proceedings of the European Conference on Computer Vision (ECCV), 2016. + +Backbone is ResNet34 pretrained on ILSVRC 2012 (from torchvision). Modifications to the backbone networks: remove conv_5x residual blocks, change the first 3x3 convolution of the conv_4x block from stride 2 to stride1 (this increases the resolution of the feature map to which detector heads are attached), attach all 6 detector heads to the output of the last conv_4x residual block. Thus detections are attached to 38x38, 19x19, 10x10, 5x5, 3x3, and 1x1 feature maps. Convolutions in the detector layers are followed by batch normalization layers. + +# 5. Quality. +### Quality metric +Metric is COCO box mAP (averaged over IoU of 0.5:0.95), computed over 2017 COCO val data. + +### Quality target +mAP of 0.212 + +### Evaluation frequency + +### Evaluation thoroughness +All the images in COCO 2017 val data set. diff --git a/single_stage_detector/ssd/base_model.py b/single_stage_detector/ssd/base_model.py new file mode 100644 index 000000000..04f39ff33 --- /dev/null +++ b/single_stage_detector/ssd/base_model.py @@ -0,0 +1,206 @@ +""" + Load the vgg16 weight and save it to special file +""" + +#from torchvision.models.vgg import vgg16 +import torch.nn as nn +import torch.nn.functional as F +import torch +from torch.autograd import Variable +from collections import OrderedDict + +from torchvision.models.resnet import resnet18, resnet34, resnet50 + +def _ModifyConvStrideDilation(conv, stride=(1, 1), padding=None): + conv.stride = stride + + if padding is not None: + conv.padding = padding + +def _ModifyBlock(block, bottleneck=False, **kwargs): + for m in list(block.children()): + if bottleneck: + _ModifyConvStrideDilation(m.conv2, **kwargs) + else: + _ModifyConvStrideDilation(m.conv1, **kwargs) + + if m.downsample is not None: + # need to make sure no padding for the 1x1 residual connection + _ModifyConvStrideDilation(list(m.downsample.children())[0], **kwargs) + +class ResNet18(nn.Module): + def __init__(self): + super().__init__() + rn18 = resnet18(pretrained=True) + + + # discard last Resnet block, avrpooling and classification FC + # layer1 = up to and including conv3 block + self.layer1 = nn.Sequential(*list(rn18.children())[:6]) + # layer2 = conv4 block only + self.layer2 = nn.Sequential(*list(rn18.children())[6:7]) + + # modify conv4 if necessary + # Always deal with stride in first block + modulelist = list(self.layer2.children()) + _ModifyBlock(modulelist[0], stride=(1,1)) + + def forward(self, data): + layer1_activation = self.layer1(data) + x = layer1_activation + layer2_activation = self.layer2(x) + + # Only need the output of conv4 + return [layer2_activation] + +class ResNet34(nn.Module): + def __init__(self): + super().__init__() + rn34 = resnet34(pretrained=True) + + # discard last Resnet block, avrpooling and classification FC + self.layer1 = nn.Sequential(*list(rn34.children())[:6]) + self.layer2 = nn.Sequential(*list(rn34.children())[6:7]) + # modify conv4 if necessary + # Always deal with stride in first block + modulelist = list(self.layer2.children()) + _ModifyBlock(modulelist[0], stride=(1,1)) + + + def forward(self, data): + layer1_activation = self.layer1(data) + x = layer1_activation + 
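+        # layer2 is the conv_4x block with its first conv changed to stride 1, so for a
+        # 300x300 input it yields the 38x38 feature map that the SSD heads build on (see README)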
layer2_activation = self.layer2(x) + + return [layer2_activation] + +class L2Norm(nn.Module): + """ + Scale shall be learnable according to original paper + scale: initial scale number + chan_num: L2Norm channel number (norm over all channels) + """ + def __init__(self, scale=20, chan_num=512): + super(L2Norm, self).__init__() + # Scale across channels + self.scale = \ + nn.Parameter(torch.Tensor([scale]*chan_num).view(1, chan_num, 1, 1)) + + def forward(self, data): + # normalize accross channel + return self.scale*data*data.pow(2).sum(dim=1, keepdim=True).clamp(min=1e-12).rsqrt() + + + +def tailor_module(src_model, src_dir, tgt_model, tgt_dir): + state = torch.load(src_dir) + src_model.load_state_dict(state) + src_state = src_model.state_dict() + # only need features + keys1 = src_state.keys() + keys1 = [k for k in src_state.keys() if k.startswith("features")] + keys2 = tgt_model.state_dict().keys() + + assert len(keys1) == len(keys2) + state = OrderedDict() + + for k1, k2 in zip(keys1, keys2): + # print(k1, k2) + state[k2] = src_state[k1] + #diff_keys = state.keys() - target_model.state_dict().keys() + #print("Different Keys:", diff_keys) + # Remove unecessary keys + #for k in diff_keys: + # state.pop(k) + tgt_model.load_state_dict(state) + torch.save(tgt_model.state_dict(), tgt_dir) + +# Default vgg16 in pytorch seems different from ssd +def make_layers(cfg, batch_norm=False): + layers = [] + in_channels = 3 + for v in cfg: + if v == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + elif v == 'C': + # Notice ceil_mode is true + layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] + else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + if batch_norm: + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] + else: + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + return layers + +class Loss(nn.Module): + """ + Implements the loss as the sum of the followings: + 1. Confidence Loss: All labels, with hard negative mining + 2. 
Localization Loss: Only on positive labels + Suppose input dboxes has the shape 8732x4 + """ + + def __init__(self, dboxes): + super(Loss, self).__init__() + self.scale_xy = 1.0/dboxes.scale_xy + self.scale_wh = 1.0/dboxes.scale_wh + + self.sl1_loss = nn.SmoothL1Loss(reduce=False) + self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0), + requires_grad=False) + # Two factor are from following links + # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html + self.con_loss = nn.CrossEntropyLoss(reduce=False) + + def _loc_vec(self, loc): + """ + Generate Location Vectors + """ + gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ] + gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log() + + return torch.cat((gxy, gwh), dim=1).contiguous() + + def forward(self, ploc, plabel, gloc, glabel): + """ + ploc, plabel: Nx4x8732, Nxlabel_numx8732 + predicted location and labels + + gloc, glabel: Nx4x8732, Nx8732 + ground truth location and labels + """ + + mask = glabel > 0 + pos_num = mask.sum(dim=1) + + vec_gd = self._loc_vec(gloc) + + # sum on four coordinates, and mask + sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1) + sl1 = (mask.float()*sl1).sum(dim=1) + + # hard negative mining + con = self.con_loss(plabel, glabel) + + # postive mask will never selected + con_neg = con.clone() + con_neg[mask] = 0 + _, con_idx = con_neg.sort(dim=1, descending=True) + _, con_rank = con_idx.sort(dim=1) + + # number of negative three times positive + neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1) + neg_mask = con_rank < neg_num + + closs = (con*(mask.float() + neg_mask.float())).sum(dim=1) + + # avoid no object detected + total_loss = sl1 + closs + num_mask = (pos_num > 0).float() + pos_num = pos_num.float().clamp(min=1e-6) + + ret = (total_loss*num_mask/pos_num).mean(dim=0) + return ret + diff --git a/single_stage_detector/ssd/coco.py b/single_stage_detector/ssd/coco.py new file mode 100755 index 000000000..dd0e880be --- /dev/null +++ b/single_stage_detector/ssd/coco.py @@ -0,0 +1,433 @@ +__author__ = 'tylin' +__version__ = '2.0' +# Interface for accessing the Microsoft COCO dataset. + +# Microsoft COCO is a large image dataset designed for object detection, +# segmentation, and caption generation. pycocotools is a Python API that +# assists in loading, parsing and visualizing the annotations in COCO. +# Please visit http://mscoco.org/ for more information on COCO, including +# for the data, paper, and tutorials. The exact format of the annotations +# is also described on the COCO website. For example usage of the pycocotools +# please see pycocotools_demo.ipynb. In addition to this API, please download both +# the COCO images and annotations in order to run the demo. + +# An alternative to using the API is to load the annotations directly +# into Python dictionary +# Using the API provides additional utility functions. Note that this API +# supports both *instance* and *caption* annotations. In the case of +# captions not all functions are defined (e.g. categories are undefined). + +# The following API functions are defined: +# COCO - COCO api class that loads COCO annotation file and prepare data structures. +# decodeMask - Decode binary mask M encoded via run-length encoding. +# encodeMask - Encode binary mask M using run-length encoding. +# getAnnIds - Get ann ids that satisfy given filter conditions. +# getCatIds - Get cat ids that satisfy given filter conditions. 
+# getImgIds - Get img ids that satisfy given filter conditions. +# loadAnns - Load anns with the specified ids. +# loadCats - Load cats with the specified ids. +# loadImgs - Load imgs with the specified ids. +# annToMask - Convert segmentation in an annotation to binary mask. +# showAnns - Display the specified annotations. +# loadRes - Load algorithm results and create API for accessing them. +# download - Download COCO images from mscoco.org server. +# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. +# Help on each functions can be accessed by: "help COCO>function". + +# See also COCO>decodeMask, +# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, +# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, +# COCO>loadImgs, COCO>annToMask, COCO>showAnns + +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. +# Licensed under the Simplified BSD License [see bsd.txt] + +import json +import time +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon +import numpy as np +import copy +import itertools +from pycocotools import mask as maskUtils +import os +from collections import defaultdict +import sys +PYTHON_VERSION = sys.version_info[0] +if PYTHON_VERSION == 2: + from urllib import urlretrieve +elif PYTHON_VERSION == 3: + from urllib.request import urlretrieve + + +def _isArrayLike(obj): + return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + + +class COCO: + def __init__(self, annotation_file=None): + """ + Constructor of Microsoft COCO helper class for reading and visualizing annotations. + :param annotation_file (str): location of annotation file + :param image_folder (str): location to the folder that hosts images. + :return: + """ + # load dataset + self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) + if not annotation_file == None: + print('loading annotations into memory...') + tic = time.time() + dataset = json.load(open(annotation_file, 'r')) + assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) + print('Done (t={:0.2f}s)'.format(time.time()- tic)) + self.dataset = dataset + self.createIndex() + + def createIndex(self): + # create index + print('creating index...') + anns, cats, imgs = {}, {}, {} + imgToAnns,catToImgs = defaultdict(list),defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + imgToAnns[ann['image_id']].append(ann) + anns[ann['id']] = ann + + if 'images' in self.dataset: + for img in self.dataset['images']: + imgs[img['id']] = img + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToImgs[ann['category_id']].append(ann['image_id']) + + print('index created!') + + # create class members + self.anns = anns + self.imgToAnns = imgToAnns + self.catToImgs = catToImgs + self.imgs = imgs + self.cats = cats + + def info(self): + """ + Print information about the annotation file. + :return: + """ + for key, value in self.dataset['info'].items(): + print('{}: {}'.format(key, value)) + + def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): + """ + Get ann ids that satisfy given filter conditions. 
default skips that filter + :param imgIds (int array) : get anns for given imgs + catIds (int array) : get anns for given cats + areaRng (float array) : get anns for given area range (e.g. [0 inf]) + iscrowd (boolean) : get anns for given crowd label (False or True) + :return: ids (int array) : integer array of ann ids + """ + imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(imgIds) == len(catIds) == len(areaRng) == 0: + anns = self.dataset['annotations'] + else: + if not len(imgIds) == 0: + lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] + anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] + if not iscrowd == None: + ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def getCatIds(self, catNms=[], supNms=[], catIds=[]): + """ + filtering parameters. default skips that filter. + :param catNms (str array) : get cats for given cat names + :param supNms (str array) : get cats for given supercategory names + :param catIds (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + catNms = catNms if _isArrayLike(catNms) else [catNms] + supNms = supNms if _isArrayLike(supNms) else [supNms] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(catNms) == len(supNms) == len(catIds) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] + cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] + cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] + ids = [cat['id'] for cat in cats] + return ids + + def getImgIds(self, imgIds=[], catIds=[]): + ''' + Get img ids that satisfy given filter conditions. + :param imgIds (int array) : get imgs for given ids + :param catIds (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + ''' + imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(imgIds) == len(catIds) == 0: + ids = self.imgs.keys() + else: + ids = set(imgIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(self.catToImgs[catId]) + else: + ids &= set(self.catToImgs[catId]) + return list(ids) + + def loadAnns(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if _isArrayLike(ids): + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def loadCats(self, ids=[]): + """ + Load cats with the specified ids. + :param ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if _isArrayLike(ids): + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def loadImgs(self, ids=[]): + """ + Load anns with the specified ids. 
+ :param ids (int array) : integer ids specifying img + :return: imgs (object array) : loaded img objects + """ + if _isArrayLike(ids): + return [self.imgs[id] for id in ids] + elif type(ids) == int: + return [self.imgs[ids]] + + def showAnns(self, anns): + """ + Display the specified annotations. + :param anns (array of object): annotations to display + :return: None + """ + if len(anns) == 0: + return 0 + if 'segmentation' in anns[0] or 'keypoints' in anns[0]: + datasetType = 'instances' + elif 'caption' in anns[0]: + datasetType = 'captions' + else: + raise Exception('datasetType not supported') + if datasetType == 'instances': + ax = plt.gca() + ax.set_autoscale_on(False) + polygons = [] + color = [] + for ann in anns: + c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] + if 'segmentation' in ann: + if type(ann['segmentation']) == list: + # polygon + for seg in ann['segmentation']: + poly = np.array(seg).reshape((int(len(seg)/2), 2)) + polygons.append(Polygon(poly)) + color.append(c) + else: + # mask + t = self.imgs[ann['image_id']] + if type(ann['segmentation']['counts']) == list: + rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) + else: + rle = [ann['segmentation']] + m = maskUtils.decode(rle) + img = np.ones( (m.shape[0], m.shape[1], 3) ) + if ann['iscrowd'] == 1: + color_mask = np.array([2.0,166.0,101.0])/255 + if ann['iscrowd'] == 0: + color_mask = np.random.random((1, 3)).tolist()[0] + for i in range(3): + img[:,:,i] = color_mask[i] + ax.imshow(np.dstack( (img, m*0.5) )) + if 'keypoints' in ann and type(ann['keypoints']) == list: + # turn skeleton into zero-based index + sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 + kp = np.array(ann['keypoints']) + x = kp[0::3] + y = kp[1::3] + v = kp[2::3] + for sk in sks: + if np.all(v[sk]>0): + plt.plot(x[sk],y[sk], linewidth=3, color=c) + plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) + plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) + p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) + ax.add_collection(p) + p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) + ax.add_collection(p) + elif datasetType == 'captions': + for ann in anns: + print(ann['caption']) + + def loadRes(self, resFile): + """ + Load result file and return a result api object. 
+ :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = COCO() + res.dataset['images'] = [img for img in self.dataset['images']] + + print('Loading and preparing results...') + tic = time.time() + if type(resFile) == str: #or type(resFile) == unicode: + anns = json.load(open(resFile)) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile + assert type(anns) == list, 'results in not an array of objects' + annsImgIds = [ann['image_id'] for ann in anns] + assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ + 'Results do not correspond to current coco set' + if 'caption' in anns[0]: + imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) + res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + for id, ann in enumerate(anns): + ann['id'] = id+1 + elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + bb = ann['bbox'] + x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] + if not 'segmentation' in ann: + ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann['area'] = bb[2]*bb[3] + ann['id'] = id+1 + ann['iscrowd'] = 0 + elif 'segmentation' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + # now only support compressed RLE format as segmentation results + ann['area'] = maskUtils.area(ann['segmentation']) + if not 'bbox' in ann: + ann['bbox'] = maskUtils.toBbox(ann['segmentation']) + ann['id'] = id+1 + ann['iscrowd'] = 0 + elif 'keypoints' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + s = ann['keypoints'] + x = s[0::3] + y = s[1::3] + x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann['area'] = (x1-x0)*(y1-y0) + ann['id'] = id + 1 + ann['bbox'] = [x0,y0,x1-x0,y1-y0] + print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + + res.dataset['annotations'] = anns + res.createIndex() + return res + + def download(self, tarDir = None, imgIds = [] ): + ''' + Download COCO images from mscoco.org server. 
+ :param tarDir (str): COCO results directory name + imgIds (list): images to be downloaded + :return: + ''' + if tarDir is None: + print('Please specify target directory') + return -1 + if len(imgIds) == 0: + imgs = self.imgs.values() + else: + imgs = self.loadImgs(imgIds) + N = len(imgs) + if not os.path.exists(tarDir): + os.makedirs(tarDir) + for i, img in enumerate(imgs): + tic = time.time() + fname = os.path.join(tarDir, img['file_name']) + if not os.path.exists(fname): + urlretrieve(img['coco_url'], fname) + print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) + + def loadNumpyAnnotations(self, data): + """ + Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} + :param data (numpy.ndarray) + :return: annotations (python nested list) + """ + print('Converting ndarray to lists...') + assert(type(data) == np.ndarray) + print(data.shape) + assert(data.shape[1] == 7) + N = data.shape[0] + ann = [] + for i in range(N): + if i % 1000000 == 0: + print('{}/{}'.format(i,N)) + ann += [{ + 'image_id' : int(data[i, 0]), + 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], + 'score' : data[i, 5], + 'category_id': int(data[i, 6]), + }] + return ann + + def annToRLE(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE to RLE. + :return: binary mask (numpy 2D array) + """ + t = self.imgs[ann['image_id']] + h, w = t['height'], t['width'] + segm = ann['segmentation'] + if type(segm) == list: + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(segm, h, w) + rle = maskUtils.merge(rles) + elif type(segm['counts']) == list: + # uncompressed RLE + rle = maskUtils.frPyObjects(segm, h, w) + else: + # rle + rle = ann['segmentation'] + return rle + + def annToMask(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. + :return: binary mask (numpy 2D array) + """ + rle = self.annToRLE(ann) + m = maskUtils.decode(rle) + return m diff --git a/single_stage_detector/ssd/distributed.py b/single_stage_detector/ssd/distributed.py new file mode 100644 index 000000000..7776997fc --- /dev/null +++ b/single_stage_detector/ssd/distributed.py @@ -0,0 +1,82 @@ +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +import torch.distributed as dist +from torch.nn.modules import Module + +''' +This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py +launcher included with this example. It assumes that your run is using multiprocess with 1 +GPU/process, that the model is on the correct device, and that torch.set_device has been +used to set the device. + +Parameters are broadcasted to the other processes on initialization of DistributedDataParallel, +and will be allreduced at the finish of the backward pass. +''' +class DistributedDataParallel(Module): + + def __init__(self, module): + super(DistributedDataParallel, self).__init__() + self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False + + self.module = module + + for p in self.module.state_dict().values(): + if not torch.is_tensor(p): + continue + if dist._backend == dist.dist_backend.NCCL: + assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU." 
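+            # broadcast every parameter/buffer from rank 0 so all workers start from identical weights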
+ dist.broadcast(p, 0) + + def allreduce_params(): + if(self.needs_reduction): + self.needs_reduction = False + buckets = {} + for param in self.module.parameters(): + if param.requires_grad and param.grad is not None: + tp = param.data.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + if self.warn_on_half: + if torch.cuda.HalfTensor in buckets: + print("WARNING: gloo dist backend for half parameters may be extremely slow." + + " It is recommended to use the NCCL backend in this case.") + self.warn_on_half = False + + for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + dist.all_reduce(coalesced) + coalesced /= dist.get_world_size() + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + for param in list(self.module.parameters()): + def allreduce_hook(*unused): + param._execution_engine.queue_callback(allreduce_params) + if param.requires_grad: + param.register_hook(allreduce_hook) + + def forward(self, *inputs, **kwargs): + self.needs_reduction = True + return self.module(*inputs, **kwargs) + + ''' + def _sync_buffers(self): + buffers = list(self.module._all_buffers()) + if len(buffers) > 0: + # cross-node buffer sync + flat_buffers = _flatten_dense_tensors(buffers) + dist.broadcast(flat_buffers, 0) + for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): + buf.copy_(synced) + def train(self, mode=True): + # Clear NCCL communicator and CUDA event cache of the default group ID, + # These cache will be recreated at the later call. This is currently a + # work-around for a potential NCCL deadlock. + if dist._backend == dist.dist_backend.NCCL: + dist._clear_group_cache() + super(DistributedDataParallel, self).train(mode) + self.module.train(mode) + ''' diff --git a/single_stage_detector/ssd/eval.py b/single_stage_detector/ssd/eval.py new file mode 100644 index 000000000..82075a55d --- /dev/null +++ b/single_stage_detector/ssd/eval.py @@ -0,0 +1,193 @@ +import numpy as np +import xml.etree.ElementTree as ET +import pickle +import os + +def voc_ap(rec, prec, use_07_metric=True): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:True). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. 
+ else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def parse_rec(filename): + """ Parse a PASCAL VOC xml file """ + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, + int(bbox.find('ymin').text) - 1, + int(bbox.find('xmax').text) - 1, + int(bbox.find('ymax').text) - 1] + objects.append(obj_struct) + + return objects + +def voc_eval(detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=True): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) +Top level function that does the PASCAL VOC evaluation. +detpath: Path to detections + detpath.format(classname) should produce the detection results file. +annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. +imagesetfile: Text file containing the list of images, one image per line. +classname: Category name (duh) +cachedir: Directory for caching the annotations +[ovthresh]: Overlap threshold (default = 0.5) +[use_07_metric]: Whether to use VOC07's 11 point AP computation + (default True) +""" +# assumes detections are in detpath.format(classname) +# assumes annotations are in annopath.format(imagename) +# assumes imagesetfile is a text file with each line an image name +# cachedir caches the annotations in a pickle file +# first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, 'annots.pkl') + # read list of images + with open(imagesetfile, 'r') as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath % (imagename)) + if i % 100 == 0: + print('Reading annotation for {:d}/{:d}'.format( + i + 1, len(imagenames))) + # save + print('Saving cached annotations to {:s}'.format(cachefile)) + with open(cachefile, 'wb') as f: + pickle.dump(recs, f) + else: + # load + with open(cachefile, 'rb') as f: + recs = pickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj['name'] == classname] + bbox = np.array([x['bbox'] for x in R]) + difficult = np.array([x['difficult'] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, 'r') as f: + lines = f.readlines() + if any(lines) == 1: + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x 
in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = np.sort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin, 0.) + ih = np.maximum(iymax - iymin, 0.) + inters = iw * ih + uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + + (BBGT[:, 2] - BBGT[:, 0]) * + (BBGT[:, 3] - BBGT[:, 1]) - inters) + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + else: + rec = -1. + prec = -1. + ap = -1. + + return rec, prec, ap + diff --git a/single_stage_detector/ssd/run_and_time.sh b/single_stage_detector/ssd/run_and_time.sh new file mode 100755 index 000000000..01b9ef983 --- /dev/null +++ b/single_stage_detector/ssd/run_and_time.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# runs benchmark and reports time to convergence +# to use the script: +# run_and_time.sh + +SEED=${1:-1} +TARGET=${2:-0.212} + +time stdbuf -o 0 \ + python3 train.py --seed $SEED --threshold $TARGET | tee run.log.$SEED diff --git a/single_stage_detector/ssd/ssd300.py b/single_stage_detector/ssd/ssd300.py new file mode 100644 index 000000000..177b42fae --- /dev/null +++ b/single_stage_detector/ssd/ssd300.py @@ -0,0 +1,141 @@ +import torch +import torch.nn as nn +from base_model import ResNet34 + +class SSD300(nn.Module): + """ + Build a SSD module to take 300x300 image input, + and output 8732 per class bounding boxes + + vggt: pretrained vgg16 (partial) model + label_num: number of classes (including background 0) + """ + def __init__(self, label_num, backbone='resnet34', model_path="./resnet34-333f7ec4.pth"): + + super(SSD300, self).__init__() + + self.label_num = label_num + + if backbone == 'resnet34': + self.model = ResNet34() + out_channels = 256 + out_size = 38 + self.out_chan = [out_channels, 512, 512, 256, 256, 256] + else: + raise ValueError('Invalid backbone chosen') + + self._build_additional_features(out_size, self.out_chan) + + # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2 + # classifer 1, 2, 3, 4, 5 ,6 + + self.num_defaults = [4, 6, 6, 6, 4, 4] + self.loc = [] + self.conf = [] + + for nd, oc in zip(self.num_defaults, self.out_chan): + self.loc.append(nn.Conv2d(oc, nd*4, kernel_size=3, padding=1)) + self.conf.append(nn.Conv2d(oc, nd*label_num, kernel_size=3, padding=1)) + + + self.loc = nn.ModuleList(self.loc) + self.conf = nn.ModuleList(self.conf) + # intitalize all weights + self._init_weights() + + def _build_additional_features(self, input_size, input_channels): + idx = 0 + if input_size == 38: + idx = 0 + elif input_size == 19: + idx = 
1 + elif input_size == 10: + idx = 2 + + self.additional_blocks = [] + + if input_size == 38: + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + )) + idx += 1 + + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + )) + idx += 1 + + # conv9_1, conv9_2 + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx+1], kernel_size=3, padding=1, stride=2), + nn.ReLU(inplace=True), + )) + idx += 1 + + # conv10_1, conv10_2 + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx+1], kernel_size=3), + nn.ReLU(inplace=True), + )) + idx += 1 + + # Only necessary in VGG for now + if input_size >= 19: + # conv11_1, conv11_2 + self.additional_blocks.append(nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx+1], kernel_size=3), + nn.ReLU(inplace=True), + )) + + self.additional_blocks = nn.ModuleList(self.additional_blocks) + + def _init_weights(self): + + layers = [ + *self.additional_blocks, + *self.loc, *self.conf] + + for layer in layers: + for param in layer.parameters(): + if param.dim() > 1: nn.init.xavier_uniform_(param) + + # Shape the classifier to the view of bboxes + def bbox_view(self, src, loc, conf): + ret = [] + for s, l, c in zip(src, loc, conf): + ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1))) + + locs, confs = list(zip(*ret)) + locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous() + return locs, confs + + def forward(self, data): + + layers = self.model(data) + + # last result from network goes into additional blocks + x = layers[-1] + additional_results = [] + for i, l in enumerate(self.additional_blocks): + x = l(x) + additional_results.append(x) + + src = [*layers, *additional_results] + # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 + + locs, confs = self.bbox_view(src, self.loc, self.conf) + + # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results + return locs, confs diff --git a/single_stage_detector/ssd/train.py b/single_stage_detector/ssd/train.py new file mode 100644 index 000000000..37a447435 --- /dev/null +++ b/single_stage_detector/ssd/train.py @@ -0,0 +1,211 @@ +import os +from argparse import ArgumentParser +from utils import DefaultBoxes, Encoder, COCODetection +from base_model import Loss +from utils import SSDTransformer +from ssd300 import SSD300 +import torch +from torch.autograd import Variable +from torch.utils.data import DataLoader +import time +import numpy as np + + +def parse_args(): + parser = ArgumentParser(description="Train Single Shot MultiBox Detector" + " on COCO") + parser.add_argument('--data', '-d', type=str, default='/coco', + help='path to test and training data files') + parser.add_argument('--epochs', '-e', type=int, default=800, + help='number of epochs for training') + parser.add_argument('--batch-size', '-b', type=int, default=32, + help='number of examples for each iteration') + parser.add_argument('--no-cuda', 
action='store_true', + help='use available GPUs') + parser.add_argument('--seed', '-s', type=int, + help='manually set random seed for torch') + parser.add_argument('--threshold', '-t', type=float, default=0.212, + help='stop training early at threshold') + parser.add_argument('--iteration', type=int, default=0, + help='iteration to start from') + parser.add_argument('--checkpoint', type=str, default=None, + help='path to model checkpoint file') + parser.add_argument('--no-save', action='store_true', + help='save model checkpoints') + parser.add_argument('--evaluation', nargs='*', type=int, + default=[120000, 160000, 180000, 200000, 220000, 240000], + help='iterations at which to evaluate') + return parser.parse_args() + + +def show_memusage(device=0): + import gpustat + gpu_stats = gpustat.GPUStatCollection.new_query() + item = gpu_stats.jsonify()["gpus"][device] + print("{}/{}".format(item["memory.used"], item["memory.total"])) + + +def dboxes300_coco(): + figsize = 300 + feat_size = [38, 19, 10, 5, 3, 1] + steps = [8, 16, 32, 64, 100, 300] + # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py + scales = [21, 45, 99, 153, 207, 261, 315] + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios) + return dboxes + + +def coco_eval(model, coco, cocoGt, encoder, inv_map, threshold, use_cuda=True): + from pycocotools.cocoeval import COCOeval + print("") + model.eval() + if use_cuda: + model.cuda() + ret = [] + start = time.time() + for idx, image_id in enumerate(coco.img_keys): + img, (htot, wtot), _, _ = coco[idx] + + with torch.no_grad(): + print("Parsing image: {}/{}".format(idx+1, len(coco)), end="\r") + inp = img.unsqueeze(0) + if use_cuda: + inp = inp.cuda() + ploc, plabel = model(inp) + + try: + result = encoder.decode_batch(ploc, plabel, 0.50, 200)[0] + except: + #raise + print("") + print("No object detected in idx: {}".format(idx)) + continue + + loc, label, prob = [r.cpu().numpy() for r in result] + for loc_, label_, prob_ in zip(loc, label, prob): + ret.append([image_id, loc_[0]*wtot, \ + loc_[1]*htot, + (loc_[2] - loc_[0])*wtot, + (loc_[3] - loc_[1])*htot, + prob_, + inv_map[label_]]) + print("") + print("Predicting Ended, total time: {:.2f} s".format(time.time()-start)) + + cocoDt = cocoGt.loadRes(np.array(ret)) + + E = COCOeval(cocoGt, cocoDt, iouType='bbox') + E.evaluate() + E.accumulate() + E.summarize() + print("Current AP: {:.5f} AP goal: {:.5f}".format(E.stats[0], threshold)) + return (E.stats[0] >= threshold) #Average Precision (AP) @[ IoU=050:0.95 | area= all | maxDets=100 ] + + + +def train300_mlperf_coco(args): + from coco import COCO + # Check that GPUs are actually available + use_cuda = not args.no_cuda and torch.cuda.is_available() + dboxes = dboxes300_coco() + encoder = Encoder(dboxes) + train_trans = SSDTransformer(dboxes, (300, 300), val=False) + val_trans = SSDTransformer(dboxes, (300, 300), val=True) + + val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") + val_coco_root = os.path.join(args.data, "val2017") + train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") + train_coco_root = os.path.join(args.data, "train2017") + + cocoGt = COCO(annotation_file=val_annotate) + val_coco = COCODetection(val_coco_root, val_annotate, val_trans) + train_coco = COCODetection(train_coco_root, train_annotate, train_trans) + + #print("Number of labels: {}".format(train_coco.labelnum)) + train_dataloader = 
DataLoader(train_coco, batch_size=args.batch_size, shuffle=True, num_workers=4) + + ssd300 = SSD300(train_coco.labelnum) + if args.checkpoint is not None: + print("loading model checkpoint", args.checkpoint) + od = torch.load(args.checkpoint) + ssd300.load_state_dict(od["model"]) + ssd300.train() + if use_cuda: + ssd300.cuda() + loss_func = Loss(dboxes) + if use_cuda: + loss_func.cuda() + + optim = torch.optim.SGD(ssd300.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4) + print("epoch", "nbatch", "loss") + + iter_num = args.iteration + avg_loss = 0.0 + inv_map = {v:k for k,v in val_coco.label_map.items()} + + for epoch in range(args.epochs): + + for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader): + + if iter_num == 160000: + print("") + print("lr decay step #1") + for param_group in optim.param_groups: + param_group['lr'] = 1e-4 + + if iter_num == 200000: + print("") + print("lr decay step #2") + for param_group in optim.param_groups: + param_group['lr'] = 1e-5 + + if use_cuda: + img = img.cuda() + img = Variable(img, requires_grad=True) + ploc, plabel = ssd300(img) + trans_bbox = bbox.transpose(1,2).contiguous() + if use_cuda: + trans_bbox = trans_bbox.cuda() + label = label.cuda() + gloc, glabel = Variable(trans_bbox, requires_grad=False), \ + Variable(label, requires_grad=False) + loss = loss_func(ploc, plabel, gloc, glabel) + + if not np.isinf(loss.item()): avg_loss = 0.999*avg_loss + 0.001*loss.item() + + print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ + .format(iter_num, loss.item(), avg_loss), end="\r") + optim.zero_grad() + loss.backward() + optim.step() + + if iter_num in args.evaluation: + if not args.no_save: + print("") + print("saving model...") + torch.save({"model" : ssd300.state_dict(), "label_map": train_coco.label_info}, + "./models/iter_{}.pt".format(iter_num)) + + if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map, args.threshold): + return + + iter_num += 1 + +def main(): + args = parse_args() + + if not os.path.isdir('./models'): + os.mkdir('./models') + + if args.seed is not None: + print("Using seed = {}".format(args.seed)) + torch.manual_seed(args.seed) + np.random.seed(seed=args.seed) + + + torch.backends.cudnn.benchmark = True + train300_mlperf_coco(args) + +if __name__ == "__main__": + main() diff --git a/single_stage_detector/ssd/utils.py b/single_stage_detector/ssd/utils.py new file mode 100644 index 000000000..e1a0901ab --- /dev/null +++ b/single_stage_detector/ssd/utils.py @@ -0,0 +1,772 @@ +import torch +import torchvision +import torchvision.transforms as transforms +import torch.utils.data as data +from PIL import Image +from xml.etree import ElementTree +import os +import glob +from pathlib import Path +import numpy as np +import random +import itertools +import torch.nn.functional as F +import json +import time +import bz2 +import pickle +from math import sqrt, ceil + +# This function is from https://github.com/kuangliu/pytorch-ssd. 
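+# Used by Encoder.encode/decode_single and SSDCropping below,
+# e.g. calc_iou_tensor(gt_boxes, dboxes) -> (num_gt, 8732) IoU matrix.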
+def calc_iou_tensor(box1, box2): + """ Calculation of IoU based on two boxes tensor, + Reference to https://github.com/kuangliu/pytorch-ssd + input: + box1 (N, 4) + box2 (M, 4) + output: + IoU (N, M) + """ + N = box1.size(0) + M = box2.size(0) + + be1 = box1.unsqueeze(1).expand(-1, M, -1) + be2 = box2.unsqueeze(0).expand(N, -1, -1) + + # Left Top & Right Bottom + lt = torch.max(be1[:,:,:2], be2[:,:,:2]) + #mask1 = (be1[:,:, 0] < be2[:,:, 0]) ^ (be1[:,:, 1] < be2[:,:, 1]) + #mask1 = ~mask1 + rb = torch.min(be1[:,:,2:], be2[:,:,2:]) + #mask2 = (be1[:,:, 2] < be2[:,:, 2]) ^ (be1[:,:, 3] < be2[:,:, 3]) + #mask2 = ~mask2 + + delta = rb - lt + delta[delta < 0] = 0 + intersect = delta[:,:,0]*delta[:,:,1] + #*mask1.float()*mask2.float() + + delta1 = be1[:,:,2:] - be1[:,:,:2] + area1 = delta1[:,:,0]*delta1[:,:,1] + delta2 = be2[:,:,2:] - be2[:,:,:2] + area2 = delta2[:,:,0]*delta2[:,:,1] + + iou = intersect/(area1 + area2 - intersect) + return iou + +# This function is from https://github.com/kuangliu/pytorch-ssd. +class Encoder(object): + """ + Inspired by https://github.com/kuangliu/pytorch-ssd + Transform between (bboxes, lables) <-> SSD output + + dboxes: default boxes in size 8732 x 4, + encoder: input ltrb format, output xywh format + decoder: input xywh format, output ltrb format + + encode: + input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes) + output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732) + criteria : IoU threshold of bboexes + + decode: + input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems) + output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes) + criteria : IoU threshold of bboexes + max_output : maximum number of output bboxes + """ + + def __init__(self, dboxes): + self.dboxes = dboxes(order="ltrb") + self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0) + self.nboxes = self.dboxes.size(0) + #print("# Bounding boxes: {}".format(self.nboxes)) + self.scale_xy = dboxes.scale_xy + self.scale_wh = dboxes.scale_wh + + def encode(self, bboxes_in, labels_in, criteria = 0.5): + + ious = calc_iou_tensor(bboxes_in, self.dboxes) + best_dbox_ious, best_dbox_idx = ious.max(dim=0) + best_bbox_ious, best_bbox_idx = ious.max(dim=1) + + # set best ious 2.0 + best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0) + + idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64) + best_dbox_idx[best_bbox_idx[idx]] = idx + + # filter IoU > 0.5 + masks = best_dbox_ious > criteria + labels_out = torch.zeros(self.nboxes, dtype=torch.long) + #print(maxloc.shape, labels_in.shape, labels_out.shape) + labels_out[masks] = labels_in[best_dbox_idx[masks]] + bboxes_out = self.dboxes.clone() + bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :] + # Transform format to xywh format + x, y, w, h = 0.5*(bboxes_out[:, 0] + bboxes_out[:, 2]), \ + 0.5*(bboxes_out[:, 1] + bboxes_out[:, 3]), \ + -bboxes_out[:, 0] + bboxes_out[:, 2], \ + -bboxes_out[:, 1] + bboxes_out[:, 3] + bboxes_out[:, 0] = x + bboxes_out[:, 1] = y + bboxes_out[:, 2] = w + bboxes_out[:, 3] = h + return bboxes_out, labels_out + + def scale_back_batch(self, bboxes_in, scores_in): + """ + Do scale and transform from xywh to ltrb + suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox + """ + if bboxes_in.device == torch.device("cpu"): + self.dboxes = self.dboxes.cpu() + self.dboxes_xywh = self.dboxes_xywh.cpu() + else: + self.dboxes = self.dboxes.cuda() + self.dboxes_xywh = self.dboxes_xywh.cuda() + + bboxes_in = bboxes_in.permute(0, 2, 1) + scores_in = scores_in.permute(0, 2, 1) + 
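+        # undo the training-time encoding: offsets are rescaled by scale_xy/scale_wh, (cx, cy) is
+        # multiplied by the default-box sizes and shifted by their centers, (w, h) is exp()'d and
+        # scaled by the default-box sizes, and finally boxes are converted from xywh to ltrb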
#print(bboxes_in.device, scores_in.device, self.dboxes_xywh.device) + + bboxes_in[:, :, :2] = self.scale_xy*bboxes_in[:, :, :2] + bboxes_in[:, :, 2:] = self.scale_wh*bboxes_in[:, :, 2:] + + bboxes_in[:, :, :2] = bboxes_in[:, :, :2]*self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2] + bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp()*self.dboxes_xywh[:, :, 2:] + + # Transform format to ltrb + l, t, r, b = bboxes_in[:, :, 0] - 0.5*bboxes_in[:, :, 2],\ + bboxes_in[:, :, 1] - 0.5*bboxes_in[:, :, 3],\ + bboxes_in[:, :, 0] + 0.5*bboxes_in[:, :, 2],\ + bboxes_in[:, :, 1] + 0.5*bboxes_in[:, :, 3] + + bboxes_in[:, :, 0] = l + bboxes_in[:, :, 1] = t + bboxes_in[:, :, 2] = r + bboxes_in[:, :, 3] = b + + return bboxes_in, F.softmax(scores_in, dim=-1) + + def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200): + bboxes, probs = self.scale_back_batch(bboxes_in, scores_in) + + output = [] + for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)): + bbox = bbox.squeeze(0) + prob = prob.squeeze(0) + output.append(self.decode_single(bbox, prob, criteria, max_output)) + #print(output[-1]) + return output + + # perform non-maximum suppression + def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200): + # Reference to https://github.com/amdegroot/ssd.pytorch + + bboxes_out = [] + scores_out = [] + labels_out = [] + + for i, score in enumerate(scores_in.split(1, 1)): + # skip background + # print(score[score>0.90]) + if i == 0: continue + # print(i) + + score = score.squeeze(1) + mask = score > 0.05 + + bboxes, score = bboxes_in[mask, :], score[mask] + if score.size(0) == 0: continue + + score_sorted, score_idx_sorted = score.sort(dim=0) + + # select max_output indices + score_idx_sorted = score_idx_sorted[-max_num:] + candidates = [] + #maxdata, maxloc = scores_in.sort() + + while score_idx_sorted.numel() > 0: + idx = score_idx_sorted[-1].item() + bboxes_sorted = bboxes[score_idx_sorted, :] + bboxes_idx = bboxes[idx, :].unsqueeze(dim=0) + iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze() + # we only need iou < criteria + score_idx_sorted = score_idx_sorted[iou_sorted < criteria] + candidates.append(idx) + + bboxes_out.append(bboxes[candidates, :]) + scores_out.append(score[candidates]) + labels_out.extend([i]*len(candidates)) + + bboxes_out, labels_out, scores_out = torch.cat(bboxes_out, dim=0), \ + torch.tensor(labels_out, dtype=torch.long), \ + torch.cat(scores_out, dim=0) + + + _, max_ids = scores_out.sort(dim=0) + max_ids = max_ids[-max_output:] + return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] + + +class DefaultBoxes(object): + def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \ + scale_xy=0.1, scale_wh=0.2): + + self.feat_size = feat_size + self.fig_size = fig_size + + self.scale_xy_ = scale_xy + self.scale_wh_ = scale_wh + + # According to https://github.com/weiliu89/caffe + # Calculation method slightly different from paper + self.steps = steps + self.scales = scales + + fk = fig_size/np.array(steps) + self.aspect_ratios = aspect_ratios + + self.default_boxes = [] + # size of feature and number of feature + for idx, sfeat in enumerate(self.feat_size): + + sk1 = scales[idx]/fig_size + sk2 = scales[idx+1]/fig_size + sk3 = sqrt(sk1*sk2) + all_sizes = [(sk1, sk1), (sk3, sk3)] + + for alpha in aspect_ratios[idx]: + w, h = sk1*sqrt(alpha), sk1/sqrt(alpha) + all_sizes.append((w, h)) + all_sizes.append((h, w)) + for w, h in all_sizes: + for i, j in itertools.product(range(sfeat), repeat=2): + cx, 
cy = (j+0.5)/fk[idx], (i+0.5)/fk[idx] + self.default_boxes.append((cx, cy, w, h)) + + self.dboxes = torch.tensor(self.default_boxes) + self.dboxes.clamp_(min=0, max=1) + # For IoU calculation + self.dboxes_ltrb = self.dboxes.clone() + self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5*self.dboxes[:, 2] + self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5*self.dboxes[:, 3] + self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5*self.dboxes[:, 2] + self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5*self.dboxes[:, 3] + + @property + def scale_xy(self): + return self.scale_xy_ + + @property + def scale_wh(self): + return self.scale_wh_ + + def __call__(self, order="ltrb"): + if order == "ltrb": return self.dboxes_ltrb + if order == "xywh": return self.dboxes + + +# This function is from https://github.com/chauhan-utk/ssd.DomainAdaptation. +class SSDCropping(object): + """ Cropping for SSD, according to original paper + Choose between following 3 conditions: + 1. Preserve the original image + 2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9 + 3. Random crop + Reference to https://github.com/chauhan-utk/ssd.DomainAdaptation + """ + def __init__(self): + + self.sample_options = ( + # Do nothing + None, + # min IoU, max IoU + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + # no IoU requirements + (None, None), + ) + + def __call__(self, img, img_size, bboxes, labels): + + # Ensure always return cropped image + while True: + mode = random.choice(self.sample_options) + + if mode is None: + return img, img_size, bboxes, labels + + htot, wtot = img_size + + min_iou, max_iou = mode + min_iou = float("-inf") if min_iou is None else min_iou + max_iou = float("+inf") if max_iou is None else max_iou + + # Implementation use 50 iteration to find possible candidate + for _ in range(50): + # suze of each sampled path in [0.1, 1] 0.3*0.3 approx. 
0.1 + w = random.uniform(0.3 , 1.0) + h = random.uniform(0.3 , 1.0) + + if w/h < 0.5 or w/h > 2: + continue + + # left 0 ~ wtot - w, top 0 ~ htot - h + left = random.uniform(0, 1.0 - w) + top = random.uniform(0, 1.0 - h) + + right = left + w + bottom = top + h + + ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]])) + + # tailor all the bboxes and return + if not ((ious > min_iou) & (ious < max_iou)).all(): + continue + + # discard any bboxes whose center not in the cropped image + xc = 0.5*(bboxes[:, 0] + bboxes[:, 2]) + yc = 0.5*(bboxes[:, 1] + bboxes[:, 3]) + + masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom) + + # if no such boxes, continue searching again + if not masks.any(): + continue + + bboxes[bboxes[:, 0] < left, 0] = left + bboxes[bboxes[:, 1] < top, 1] = top + bboxes[bboxes[:, 2] > right, 2] = right + bboxes[bboxes[:, 3] > bottom, 3] = bottom + + #print(left, top, right, bottom) + #print(labels, bboxes, masks) + bboxes = bboxes[masks, :] + labels = labels[masks] + + left_idx = int(left*wtot) + top_idx = int(top*htot) + right_idx = int(right*wtot) + bottom_idx = int(bottom*htot) + #print(left_idx,top_idx,right_idx,bottom_idx) + #img = img[:, top_idx:bottom_idx, left_idx:right_idx] + img = img.crop((left_idx, top_idx, right_idx, bottom_idx)) + + bboxes[:, 0] = (bboxes[:, 0] - left)/w + bboxes[:, 1] = (bboxes[:, 1] - top)/h + bboxes[:, 2] = (bboxes[:, 2] - left)/w + bboxes[:, 3] = (bboxes[:, 3] - top)/h + + htot = bottom_idx - top_idx + wtot = right_idx - left_idx + return img, (htot, wtot), bboxes, labels + +class ToTensor(object): + def __init__(self): + pass + + def __call__(self, img): + img = torch.Tensor(np.array(img)) + # Transform from HWC to CHW + img = img.permute(2, 0 ,1) + return img + +class LightingNoice(object): + """ + See this question, AlexNet data augumentation: + https://stackoverflow.com/questions/43328600 + """ + def __init__(self): + self.eigval = torch.tensor([55.46, 4.794, 1.148]) + self.eigvec = torch.tensor([ + [-0.5675, 0.7192, 0.4009], + [-0.5808, -0.0045, -0.8140], + [-0.5836, -0.6948, 0.4203]]) + + def __call__(self, img): + img = torch.Tensor(np.array(img)) + # Transform from HWC to CHW + img = img.permute(2, 0 ,1) + return img + alpha0 = random.gauss(sigma=0.1, mu=0) + alpha1 = random.gauss(sigma=0.1, mu=0) + alpha2 = random.gauss(sigma=0.1, mu=0) + + channels = alpha0*self.eigval[0]*self.eigvec[0, :] + \ + alpha1*self.eigval[1]*self.eigvec[1, :] + \ + alpha2*self.eigval[2]*self.eigvec[2, :] + channels = channels.view(3, 1, 1) + img += channels + + return img + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, image, bboxes): + if random.random() < self.p: + bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0] + return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes + return image, bboxes + +# Do data augumentation +class SSDTransformer(object): + """ SSD Data Augumentation, according to original paper + Composed by several steps: + Cropping + Resize + Flipping + Jittering + """ + def __init__(self, dboxes, size = (300, 300), val=False): + + # define vgg16 mean + self.size = size + self.val = val + + self.dboxes_ = dboxes #DefaultBoxes300() + self.encoder = Encoder(self.dboxes_) + + self.crop = SSDCropping() + self.img_trans = transforms.Compose([ + transforms.Resize(self.size), + #transforms.Resize((300, 300)), + #transforms.RandomHorizontalFlip(), + transforms.ColorJitter(brightness=0.125, contrast=0.5, + saturation=0.5, hue=0.05 + ), + 
+            transforms.ToTensor()
+            #LightingNoice(),
+        ])
+        self.hflip = RandomHorizontalFlip()
+
+        # All PyTorch tensors are normalized with the ImageNet statistics
+        # https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
+        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                              std=[0.229, 0.224, 0.225])
+        #self.normalize = transforms.Normalize(mean = [104.0, 117.0, 123.0],
+        #                                      std = [1.0, 1.0, 1.0])
+
+        self.trans_val = transforms.Compose([
+            transforms.Resize(self.size),
+            transforms.ToTensor(),
+            #ToTensor(),
+            self.normalize,])
+
+    @property
+    def dboxes(self):
+        return self.dboxes_
+
+    def __call__(self, img, img_size, bbox=None, label=None, max_num=200):
+        #img = torch.tensor(img)
+        if self.val:
+            # Validation: pad ground truth to a fixed size (max_num) so batches can be collated
+            bbox_out = torch.zeros(max_num, 4)
+            label_out = torch.zeros(max_num, dtype=torch.long)
+            bbox_out[:bbox.size(0), :] = bbox
+            label_out[:label.size(0)] = label
+            return self.trans_val(img), img_size, bbox_out, label_out
+
+        #print("before", img.size, bbox)
+        img, img_size, bbox, label = self.crop(img, img_size, bbox, label)
+        #print("after", img.size, bbox)
+        img, bbox = self.hflip(img, bbox)
+
+        img = self.img_trans(img).contiguous()
+        #img = img.contiguous().div(255)
+        img = self.normalize(img)
+
+        bbox, label = self.encoder.encode(bbox, label)
+
+        return img, img_size, bbox, label
+
+# Implement a datareader for the COCO dataset
+class COCODetection(data.Dataset):
+    def __init__(self, img_folder, annotate_file, transform=None):
+        self.img_folder = img_folder
+        self.annotate_file = annotate_file
+
+        # Start processing annotations
+        with open(annotate_file) as fin:
+            self.data = json.load(fin)
+
+        self.images = {}
+
+        self.label_map = {}
+        self.label_info = {}
+        #print("Parsing COCO data...")
+        start_time = time.time()
+        # 0 stands for the background
+        cnt = 0
+        self.label_info[cnt] = "background"
+        for cat in self.data["categories"]:
+            cnt += 1
+            self.label_map[cat["id"]] = cnt
+            self.label_info[cnt] = cat["name"]
+
+        # build an index of images: id -> (file name, size, boxes)
+        for img in self.data["images"]:
+            img_id = img["id"]
+            img_name = img["file_name"]
+            img_size = (img["height"], img["width"])
+            #print(img_name)
+            if img_id in self.images: raise Exception("duplicated image record")
+            self.images[img_id] = (img_name, img_size, [])
+
+        # read bboxes
+        for bboxes in self.data["annotations"]:
+            img_id = bboxes["image_id"]
+            category_id = bboxes["category_id"]
+            bbox = bboxes["bbox"]
+            bbox_label = self.label_map[category_id]
+            self.images[img_id][2].append((bbox, bbox_label))
+
+        # drop images that have no annotated boxes
+        for k, v in list(self.images.items()):
+            if len(v[2]) == 0:
+                #print("empty image: {}".format(k))
+                self.images.pop(k)
+
+        self.img_keys = list(self.images.keys())
+        self.transform = transform
+        #print("End parsing COCO data, total time {}".format(time.time()-start_time))
+
+    @property
+    def labelnum(self):
+        return len(self.label_info)
+
+    @staticmethod
+    def load(pklfile):
+        #print("Loading from {}".format(pklfile))
+        with bz2.open(pklfile, "rb") as fin:
+            ret = pickle.load(fin)
+        return ret
+
+    def save(self, pklfile):
+        #print("Saving to {}".format(pklfile))
+        with bz2.open(pklfile, "wb") as fout:
+            pickle.dump(self, fout)
+
+    def __len__(self):
+        return len(self.images)
+
+    def __getitem__(self, idx):
+        img_id = self.img_keys[idx]
+        img_data = self.images[img_id]
+        fn = img_data[0]
+        img_path = os.path.join(self.img_folder, fn)
+        img = Image.open(img_path).convert("RGB")
+
+        htot, wtot = img_data[1]
+        bbox_sizes = []
+        bbox_labels = []
+
+        # COCO bboxes are (left, top, width, height); convert to fractional ltrb
+        #for (xc, yc, w, h), bbox_label in img_data[2]:
+        for (l, t, w, h), bbox_label in img_data[2]:
+            r = l + w
+            b = t + h
+            #l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h
+            bbox_size = (l/wtot, t/htot, r/wtot, b/htot)
+            bbox_sizes.append(bbox_size)
+            bbox_labels.append(bbox_label)
+
+        bbox_sizes = torch.tensor(bbox_sizes)
+        bbox_labels = torch.tensor(bbox_labels)
+
+        if self.transform is not None:
+            img, (htot, wtot), bbox_sizes, bbox_labels = \
+                self.transform(img, (htot, wtot), bbox_sizes, bbox_labels)
+
+        return img, (htot, wtot), bbox_sizes, bbox_labels
+
+# Implement a datareader for the VOC dataset
+class VOCDetection(data.Dataset):
+    """ VOC PASCAL 07/12 DataReader
+        params:
+            img_folder: image folder
+            annotate_folder: annotation folder (xml)
+    """
+    def __init__(self, img_folder, annotate_folder, file_filter, transform=None, label_map = {}, difficult=True):
+        #print("Reading data information")
+
+        self.img_folder = img_folder
+        self.annotate_folder = annotate_folder
+        self.transform = transform
+        self.difficult = difficult
+        self.file_filter = file_filter
+
+        # Read the file filter (list of image basenames to keep)
+        with open(file_filter, "r") as fin:
+            self.filter = fin.read().strip().split("\n")
+
+        self.images = []
+        self.label_num = 0
+        self.label_map = {v: k for k, v in label_map.items()}
+
+        for xml_file in glob.glob(os.path.join(annotate_folder, "*.xml")):
+            ret = self._parse_xml(xml_file)
+            if ret:
+                self.images.append(ret)
+
+        self.label_map = {v: k for k, v in self.label_map.items()}
+        # Add background label
+        self.label_map[0] = "background"
+        self.label_num += 1
+        #print("Finished Reading")
+
+    def _parse_xml(self, xml_file):
+        #print(xml_file)
+        root = ElementTree.ElementTree(file=xml_file)
+        img_name = root.find("filename").text
+        # Get basename
+        base_name = Path(img_name).resolve().stem
+        if base_name not in self.filter:
+            return []
+
+        img_size = (
+            int(root.find("size").find("height").text),
+            int(root.find("size").find("width").text),
+            int(root.find("size").find("depth").text), )
+
+        tmp_data = []
+        for obj in root.findall("object"):
+            # extract xmin, ymin, xmax, ymax
+            difficult = obj.find("difficult").text
+            if difficult == "1" and not self.difficult:
+                continue
+            bbox = (
+                int(obj.find("bndbox").find("xmin").text),
+                int(obj.find("bndbox").find("ymin").text),
+                int(obj.find("bndbox").find("xmax").text),
+                int(obj.find("bndbox").find("ymax").text), )
+            bbox_label = obj.find("name").text
+            if bbox_label in self.label_map:
+                bbox_label = self.label_map[bbox_label]
+            else:
+                self.label_num += 1
+                self.label_map[bbox_label] = self.label_num
+                bbox_label = self.label_num
+            tmp_data.append((bbox, bbox_label))
+
+        return (img_name, img_size, tmp_data)
+
+    def __getitem__(self, idx):
+
+        image_info = self.images[idx]
+        #print(self.images)
+        #print(image_info)
+        img_path = os.path.join(self.img_folder, image_info[0])
+        #img = np.array(Image.open(img_path).convert('RGB'))
+        img = Image.open(img_path)
+
+        # Assert that the record in the xml and the image match
+        # assert img.size == image_info[1], "Image Size Does Not Match!"
+
+        htot, wtot, _ = image_info[1]
+
+        bbox_sizes = []
+        bbox_labels = []
+
+        for (xmin, ymin, xmax, ymax), bbox_label in image_info[2]:
+            #cx, cy, w, h = (xmin + xmax)/2, (ymin + ymax)/2, xmax - xmin, ymax - ymin
+            #bbox_size = (cx, cy, w, h)
+            #print(cx, cy, w, h)
+            #bbox_size = (cx/wtot, cy/htot, w/wtot, h/htot)
+            l, t, r, b = xmin, ymin, xmax, ymax
+            bbox_size = (l/wtot, t/htot, r/wtot, b/htot)
+            bbox_sizes.append(bbox_size)
+            #bbox_labels.append(self.label_map[bbox_label])
+            bbox_labels.append(bbox_label)
+
+        bbox_sizes = torch.tensor(bbox_sizes)
+        bbox_labels = torch.tensor(bbox_labels)
+        #bbox_size = (xmin, ymin, xmax, ymax)
+        #bbox_label = bbox_info[3]
+
+        # Perform image transformation
+        if self.transform is not None:
+            img, (htot, wtot), bbox_sizes, bbox_labels = \
+                self.transform(img, (htot, wtot), bbox_sizes, bbox_labels)
+
+        #print(img.shape, bbox_sizes.shape, bbox_labels.shape)
+        #print(idx, "non_bg:", (bbox_labels > 0).sum().item())
+        #print(img.shape)
+        return img, (htot, wtot), bbox_sizes, bbox_labels
+
+    def __len__(self):
+        return len(self.images)
+
+
+def draw_patches(img, bboxes, labels, order="xywh", label_map={}):
+
+    import matplotlib.pyplot as plt
+    import matplotlib.patches as patches
+    # Assume bboxes are in fractional coordinates:
+    #   (cx, cy, w, h) by default, or (l, t, r, b) when order="ltrb"
+    # img = img.numpy()
+    img = np.array(img)
+    labels = np.array(labels)
+    bboxes = bboxes.numpy()
+
+    if label_map:
+        labels = [label_map.get(l) for l in labels]
+
+    if order == "ltrb":
+        xmin, ymin, xmax, ymax = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
+        cx, cy, w, h = (xmin + xmax)/2, (ymin + ymax)/2, xmax - xmin, ymax - ymin
+    else:
+        cx, cy, w, h = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
+
+    htot, wtot, _ = img.shape
+    cx *= wtot
+    cy *= htot
+    w *= wtot
+    h *= htot
+
+    bboxes = zip(cx, cy, w, h)
+
+    plt.imshow(img)
+    ax = plt.gca()
+    for (cx, cy, w, h), label in zip(bboxes, labels):
+        if label == "background": continue
+        ax.add_patch(patches.Rectangle((cx-0.5*w, cy-0.5*h),
+                                       w, h, fill=False, color="r"))
+        bbox_props = dict(boxstyle="round", fc="y", ec="0.5", alpha=0.3)
+        ax.text(cx-0.5*w, cy-0.5*h, label, ha="center", va="center", size=15, bbox=bbox_props)
+    plt.show()
+
+
+if __name__ == "__main__":
+
+    #trans = SSDTransformer()
+    #vd = VOCDetection("../../VOCdevkit/VOC2007/JPEGImages",
+    #                  "../../VOCdevkit/VOC2007/Annotations",
+    #                  "../../VOCdevkit/VOC2007/ImageSets/Main/trainval.txt",
+    #                  transform = trans)
+
+    #imgs, img_size, bbox, label = vd[0]
+    #img = imgs[:, :, :]
+    #img *= torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
+    #img += torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
+    #img = img.permute(1, 2, 0)
+    #print(bbox[label>0], label[label>0])
+    #draw_patches(img, bbox[label>0], label[label>0], order="xywh", label_map=vd.label_map)
+
+    annotate = "../../coco_ssd/instances_valminusminival2014.json"
+    coco_root = "../../coco_data/val2014"
+
+    coco = COCODetection(coco_root, annotate)
+    #coco.save("save.pb2")
+    print(len(coco))
+    #img, img_size, bbox, label = coco[2]
+    #draw_patches(img, bbox, label, order="ltrb", label_map=coco.label_info)
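For orientation, here is a minimal usage sketch of the data pipeline added above; it is not part of the reference code. It assumes the COCO 2017 layout produced by `download_dataset.sh` (`/coco/train2017`, `/coco/annotations/instances_train2017.json`), that this module is importable as `utils`, and that `DefaultBoxes`, `SSDTransformer`, and `COCODetection` keep the constructor signatures shown in this file. The SSD300 feature-map sizes, steps, scales, and aspect ratios below are the usual values and are illustrative only.

```
from torch.utils.data import DataLoader
# Assumption: this module is importable as `utils`
from utils import DefaultBoxes, SSDTransformer, COCODetection

# Default (prior) boxes for SSD300: 8732 boxes over the 38/19/10/5/3/1 feature maps
# (parameter values are the usual SSD300 settings, shown here for illustration)
dboxes = DefaultBoxes(fig_size=300,
                      feat_size=[38, 19, 10, 5, 3, 1],
                      steps=[8, 16, 32, 64, 100, 300],
                      scales=[21, 45, 99, 153, 207, 261, 315],
                      aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]])

# Training-mode transform: random crop + flip + color jitter, then encode boxes against dboxes
train_trans = SSDTransformer(dboxes, size=(300, 300), val=False)
train_data = COCODetection("/coco/train2017",
                           "/coco/annotations/instances_train2017.json",
                           transform=train_trans)

loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
imgs, (htot, wtot), gloc, glabel = next(iter(loader))
# imgs: (32, 3, 300, 300); gloc/glabel: per-default-box targets produced by Encoder.encode
```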