From e76d4cd9eb3bd101bace3d6ce7c51157b24289ec Mon Sep 17 00:00:00 2001 From: bearpaw Date: Wed, 7 Jun 2017 22:12:15 +0800 Subject: [PATCH] Clean the code. Add ResNeXt. Add new progress bar. --- .gitignore | 2 + .gitmodules | 3 + TRAINING.md | 8 + checkpoints/resnext-8x64d/log.txt | 1 + cifar.py | 175 ++++++++---- exp/cifar_train_softatt_residual_false.sh | 13 + models/__init__.py | 11 +- models/alexnet.py | 17 +- models/densenet.py | 157 ----------- models/hourglass.py | 142 ---------- models/inception.py | 326 ---------------------- models/modules.py | 103 ------- models/preresnet.py | 182 ------------ models/resadvnet.py | 268 ------------------ models/resattnet.py | 164 ----------- models/resnet.py | 1 - models/resnext.py | 126 +++++++++ models/ressoftattnet.py | 200 ------------- models/squeezenet.py | 130 --------- utils/__init__.py | 8 +- utils/eval.py | 18 ++ utils/misc.py | 105 ++----- utils/progress | 1 + 23 files changed, 329 insertions(+), 1832 deletions(-) create mode 100644 .gitmodules create mode 100644 TRAINING.md create mode 100644 checkpoints/resnext-8x64d/log.txt create mode 100755 exp/cifar_train_softatt_residual_false.sh delete mode 100644 models/densenet.py delete mode 100644 models/hourglass.py delete mode 100644 models/inception.py delete mode 100644 models/modules.py delete mode 100644 models/preresnet.py delete mode 100644 models/resadvnet.py delete mode 100644 models/resattnet.py create mode 100644 models/resnext.py delete mode 100644 models/ressoftattnet.py delete mode 100644 models/squeezenet.py create mode 100644 utils/eval.py create mode 160000 utils/progress diff --git a/.gitignore b/.gitignore index 8866be1..0ba499a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ checkpoint data cifar-debug.py test.eps +dev +monitor.py # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..5bcbdeb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "utils/progress"] + path = utils/progress + url = https://github.com/verigak/progress.git diff --git a/TRAINING.md b/TRAINING.md new file mode 100644 index 0000000..03938ad --- /dev/null +++ b/TRAINING.md @@ -0,0 +1,8 @@ +# Training recipes + +## CIFAR-10 + +ResNet-110 +```sh +CUDA_VISIBLE_DEVICES=0,1 python cifar.py -a resnet110 -d cifar10 +``` \ No newline at end of file diff --git a/checkpoints/resnext-8x64d/log.txt b/checkpoints/resnext-8x64d/log.txt new file mode 100644 index 0000000..94664ed --- /dev/null +++ b/checkpoints/resnext-8x64d/log.txt @@ -0,0 +1 @@ +LR Train Loss Valid Loss Train Acc. Valid Acc.
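Note on the new submodule: the progress bar added by this patch lives in `utils/progress`, which is a git submodule (see the `.gitmodules` entry above), so a plain clone will not contain it and the `Bar` import in `cifar.py` will likely fail until it is fetched. A minimal sketch of the extra step; `<repo-url>` is a placeholder for this repository's URL:

```sh
# Clone together with the utils/progress submodule (<repo-url> is a placeholder).
git clone --recursive <repo-url>

# Or, inside an existing checkout, fetch the submodule afterwards.
git submodule update --init --recursive
```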
diff --git a/cifar.py b/cifar.py index 0b1cdaa..dc99544 100644 --- a/cifar.py +++ b/cifar.py @@ -36,10 +36,9 @@ import torch.utils.data as data import torchvision.transforms as transforms import torchvision.datasets as datasets -# import torchvision.models as models import models -from utils import * +from utils import Bar, Logger, AverageMeter, accuracy, mkdir_p model_names = sorted(name for name in models.__dict__ @@ -47,15 +46,12 @@ and callable(models.__dict__[name])) parser = argparse.ArgumentParser(description='PyTorch CIFAR10/100 Training') -parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet20', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet18)') +# Datasets parser.add_argument('-d', '--dataset', default='cifar10', type=str) parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 4)') -parser.add_argument('--epochs', default=164, type=int, metavar='N', +# Optimization options +parser.add_argument('--epochs', default=300, type=int, metavar='N', help='number of total epochs to run') parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)') @@ -65,19 +61,34 @@ help='test batchsize') parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, metavar='LR', help='initial learning rate') +parser.add_argument('--schedule', type=int, nargs='+', default=[150, 225], + help='Decrease learning rate at these epochs.') +parser.add_argument('--gamma', type=float, default=0.1, help='LR is multiplied by gamma on schedule.') parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') -parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, +parser.add_argument('--weight-decay', '--wd', default=5e-4, type=float, metavar='W', help='weight decay (default: 5e-4)') +# Checkpoints parser.add_argument('-c', '--checkpoint', default='checkpoint', type=str, metavar='PATH', help='path to save checkpoint (default: checkpoint)') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') +# Architecture +parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet20', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet20)') +parser.add_argument('--depth', type=int, default=29, help='Model depth.') +parser.add_argument('--cardinality', type=int, default=8, help='Model cardinality (group).') +parser.add_argument('--widen-factor', type=int, default=4, help='Widen factor. 4 -> 64, 8 -> 128, ...') +# Miscs +parser.add_argument('--manualSeed', type=int, help='manual seed') parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set') -parser.add_argument('--manualSeed', type=int, help='manual seed') args = parser.parse_args() +state = {k: v for k, v in args._get_kwargs()} # Validate dataset assert args.dataset == 'cifar10' or args.dataset == 'cifar100', 'Dataset can only be cifar10 or cifar100.'
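For reference, a hedged sketch of how the new command-line options fit together; the flag names are taken from the argparse definitions above, while the architecture name, values, and checkpoint path are illustrative (chosen to mirror the `checkpoints/resnext-8x64d` log directory added by this patch):

```sh
# Illustrative ResNeXt run using the options added in this patch; per the
# --widen-factor help text, cardinality 8 with widen factor 4 matches "8x64d".
CUDA_VISIBLE_DEVICES=0 python cifar.py -a resnext -d cifar10 \
    --depth 29 --cardinality 8 --widen-factor 4 \
    --epochs 300 --schedule 150 225 --gamma 0.1 --wd 5e-4 \
    --checkpoint checkpoints/resnext-8x64d
```

With the defaults shown here (`--lr 0.1`, `--schedule 150 225`, `--gamma 0.1`), the learning rate is multiplied by 0.1 at each scheduled epoch, i.e. it drops to 0.01 at epoch 150 and to 0.001 at epoch 225.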
@@ -133,7 +144,15 @@ def main(): # Model print("=> creating model '{}'".format(args.arch)) - model = models.__dict__[args.arch](num_classes=num_classes) + if args.arch == 'resnext': + model = models.__dict__[args.arch]( + cardinality=args.cardinality, + num_classes=num_classes, + depth=args.depth, + widen_factor=args.widen_factor + ) + else: + model = models.__dict__[args.arch](num_classes=num_classes) if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() @@ -159,7 +178,7 @@ def main(): logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) - logger.set_names(['Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) + logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.']) if args.evaluate: @@ -170,18 +189,15 @@ def main(): # Train and val for epoch in range(start_epoch, args.epochs): - lr = adjust_learning_rate(optimizer, epoch) + adjust_learning_rate(optimizer, epoch) - print('\nEpoch: [%d | %d] LR: %f' % (epoch, args.epochs, lr)) + print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr'])) train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) - print(' Train Loss: %.8f, Train Acc: %.2f' % (train_loss, train_acc*100)) - print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc*100)) - # append logger file - logger.append([train_loss, test_loss, train_acc, test_acc]) + logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc]) # save model is_best = test_acc > best_acc @@ -202,50 +218,111 @@ def main(): print(best_acc) def train(trainloader, model, criterion, optimizer, epoch, use_cuda): + # switch to train mode model.train() - train_loss = 0 - correct = 0 - total = 0 + + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + end = time.time() + + bar = Bar('Processing', max=len(trainloader)) for batch_idx, (inputs, targets) in enumerate(trainloader): + # measure data loading time + data_time.update(time.time() - end) + if use_cuda: - inputs, targets = inputs.cuda(), targets.cuda() - optimizer.zero_grad() + inputs, targets = inputs.cuda(), targets.cuda(async=True) inputs, targets = torch.autograd.Variable(inputs), torch.autograd.Variable(targets) + + # compute output outputs = model(inputs) loss = criterion(outputs, targets) + + # measure accuracy and record loss + prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5)) + losses.update(loss.data[0], inputs.size(0)) + top1.update(prec1[0], inputs.size(0)) + top5.update(prec5[0], inputs.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() loss.backward() optimizer.step() - train_loss += loss.data[0] - _, predicted = torch.max(outputs.data, 1) - total += targets.size(0) - correct += predicted.eq(targets.data).cpu().sum() - - progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) - return (train_loss/total, correct*1.0/total) + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + # plot progress + bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: 
.4f}'.format( + batch=batch_idx + 1, + size=len(trainloader), + data=data_time.avg, + bt=batch_time.avg, + total=bar.elapsed_td, + eta=bar.eta_td, + loss=losses.avg, + top1=top1.avg, + top5=top5.avg, + ) + bar.next() + bar.finish() + return (losses.avg, top1.avg) def test(testloader, model, criterion, epoch, use_cuda): global best_acc + + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + + # switch to evaluate mode model.eval() - test_loss = 0 - correct = 0 - total = 0 + + end = time.time() + bar = Bar('Processing', max=len(testloader)) for batch_idx, (inputs, targets) in enumerate(testloader): + # measure data loading time + data_time.update(time.time() - end) + if use_cuda: inputs, targets = inputs.cuda(), targets.cuda() inputs, targets = torch.autograd.Variable(inputs, volatile=True), torch.autograd.Variable(targets) + + # compute output outputs = model(inputs) loss = criterion(outputs, targets) - test_loss += loss.data[0] - _, predicted = torch.max(outputs.data, 1) - total += targets.size(0) - correct += predicted.eq(targets.data).cpu().sum() - - progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) - return (test_loss/total, correct*1.0/total) + # measure accuracy and record loss + prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5)) + losses.update(loss.data[0], inputs.size(0)) + top1.update(prec1[0], inputs.size(0)) + top5.update(prec5[0], inputs.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + # plot progress + bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | top1: {top1: .4f} | top5: {top5: .4f}'.format( + batch=batch_idx + 1, + size=len(testloader), + data=data_time.avg, + bt=batch_time.avg, + total=bar.elapsed_td, + eta=bar.eta_td, + loss=losses.avg, + top1=top1.avg, + top5=top5.avg, + ) + bar.next() + bar.finish() + return (losses.avg, top1.avg) def save_checkpoint(state, is_best, checkpoint='checkpoint', filename='checkpoint.pth.tar'): filepath = os.path.join(checkpoint, filename) @@ -254,15 +331,11 @@ def save_checkpoint(state, is_best, checkpoint='checkpoint', filename='checkpoin shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar')) def adjust_learning_rate(optimizer, epoch): - deday = 0 - if epoch >= 122: - deday = 2 - elif epoch >= 81: - deday = 1 - lr = args.lr * (0.1 ** deday) - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr + global state + if epoch in args.schedule: + state['lr'] *= args.gamma + for param_group in optimizer.param_groups: + param_group['lr'] = state['lr'] if __name__ == '__main__': main() \ No newline at end of file diff --git a/exp/cifar_train_softatt_residual_false.sh b/exp/cifar_train_softatt_residual_false.sh new file mode 100755 index 0000000..427ad46 --- /dev/null +++ b/exp/cifar_train_softatt_residual_false.sh @@ -0,0 +1,13 @@ +GPUID=$1 +DATASET=$2 +NET=$3 + +cd .. 
+ +CUDA_VISIBLE_DEVICES=$GPUID python cifar.py -d $DATASET -a ${NET}20 --checkpoint checkpoint/$DATASET/ResSoftAttNet_res_false/${NET}20 --manualSeed 1234 +CUDA_VISIBLE_DEVICES=$GPUID python cifar.py -d $DATASET -a ${NET}32 --checkpoint checkpoint/$DATASET/ResSoftAttNet_res_false/${NET}32 --manualSeed 1234 +CUDA_VISIBLE_DEVICES=$GPUID python cifar.py -d $DATASET -a ${NET}44 --checkpoint checkpoint/$DATASET/ResSoftAttNet_res_false/${NET}44 --manualSeed 1234 +CUDA_VISIBLE_DEVICES=$GPUID python cifar.py -d $DATASET -a ${NET}56 --checkpoint checkpoint/$DATASET/ResSoftAttNet_res_false/${NET}56 --manualSeed 1234 +CUDA_VISIBLE_DEVICES=$GPUID python cifar.py -d $DATASET -a ${NET}110 --checkpoint checkpoint/$DATASET/ResSoftAttNet_res_false/${NET}110 --manualSeed 1234 + +cd - \ No newline at end of file diff --git a/models/__init__.py b/models/__init__.py index a3f53cb..125337d 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -62,11 +62,12 @@ from .alexnet import * from .vgg import * from .resnet import * -from .preresnet import * -from .hourglass import * -from .resattnet import * -from .ressoftattnet import * -from .resadvnet import * +from .resnext import * +# from .preresnet import * +# from .hourglass import * +# from .resattnet import * +# from .ressoftattnet import * +# from .resadvnet import * # from .squeezenet import * # from .inception import * # from .densenet import * diff --git a/models/alexnet.py b/models/alexnet.py index 92ea27e..8c9407d 100644 --- a/models/alexnet.py +++ b/models/alexnet.py @@ -3,20 +3,14 @@ (c) YANG, Wei ''' import torch.nn as nn -import torch.utils.model_zoo as model_zoo -__all__ = ['AlexNet', 'alexnet'] - - -model_urls = { - 'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth', -} +__all__ = ['alexnet'] class AlexNet(nn.Module): - def __init__(self, num_classes=1000): + def __init__(self, num_classes=10): super(AlexNet, self).__init__() self.features = nn.Sequential( nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=5), @@ -42,14 +36,9 @@ def forward(self, x): return x -def alexnet(pretrained=False, **kwargs): +def alexnet(**kwargs): r"""AlexNet model architecture from the `"One weird trick..." `_ paper. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = AlexNet(**kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['alexnet'])) return model diff --git a/models/densenet.py b/models/densenet.py deleted file mode 100644 index 0b32d80..0000000 --- a/models/densenet.py +++ /dev/null @@ -1,157 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.model_zoo as model_zoo -from collections import OrderedDict - -__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161'] - - -model_urls = { - 'densenet121': 'https://download.pytorch.org/models/densenet121-241335ed.pth', - 'densenet169': 'https://download.pytorch.org/models/densenet169-6f0f7f60.pth', - 'densenet201': 'https://download.pytorch.org/models/densenet201-4c113574.pth', - 'densenet161': 'https://download.pytorch.org/models/densenet161-17b70270.pth', -} - - -def densenet121(pretrained=False, **kwargs): - r"""Densenet-121 model from - `"Densely Connected Convolutional Networks" ` - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16)) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['densenet121'])) - return model - - -def densenet169(pretrained=False, **kwargs): - r"""Densenet-169 model from - `"Densely Connected Convolutional Networks" ` - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32)) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['densenet169'])) - return model - - -def densenet201(pretrained=False, **kwargs): - r"""Densenet-201 model from - `"Densely Connected Convolutional Networks" ` - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32)) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['densenet201'])) - return model - - -def densenet161(pretrained=False, **kwargs): - r"""Densenet-161 model from - `"Densely Connected Convolutional Networks" ` - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24)) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['densenet161'])) - return model - - -class _DenseLayer(nn.Sequential): - def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): - super(_DenseLayer, self).__init__() - self.add_module('norm.1', nn.BatchNorm2d(num_input_features)), - self.add_module('relu.1', nn.ReLU(inplace=True)), - self.add_module('conv.1', nn.Conv2d(num_input_features, bn_size * - growth_rate, kernel_size=1, stride=1, bias=False)), - self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)), - self.add_module('relu.2', nn.ReLU(inplace=True)), - self.add_module('conv.2', nn.Conv2d(bn_size * growth_rate, growth_rate, - kernel_size=3, stride=1, padding=1, bias=False)), - self.drop_rate = drop_rate - - def forward(self, x): - new_features = super(_DenseLayer, self).forward(x) - if self.drop_rate > 0: - new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) - return torch.cat([x, new_features], 1) - - -class _DenseBlock(nn.Sequential): - def __init__(self, num_layers, num_input_features, 
bn_size, growth_rate, drop_rate): - super(_DenseBlock, self).__init__() - for i in range(num_layers): - layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate) - self.add_module('denselayer%d' % (i + 1), layer) - - -class _Transition(nn.Sequential): - def __init__(self, num_input_features, num_output_features): - super(_Transition, self).__init__() - self.add_module('norm', nn.BatchNorm2d(num_input_features)) - self.add_module('relu', nn.ReLU(inplace=True)) - self.add_module('conv', nn.Conv2d(num_input_features, num_output_features, - kernel_size=1, stride=1, bias=False)) - self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2)) - - -class DenseNet(nn.Module): - r"""Densenet-BC model class, based on - `"Densely Connected Convolutional Networks" ` - - Args: - growth_rate (int) - how many filters to add each layer (`k` in paper) - block_config (list of 4 ints) - how many layers in each pooling block - num_init_features (int) - the number of filters to learn in the first convolution layer - bn_size (int) - multiplicative factor for number of bottle neck layers - (i.e. bn_size * k features in the bottleneck layer) - drop_rate (float) - dropout rate after each dense layer - num_classes (int) - number of classification classes - """ - def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), - num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000): - - super(DenseNet, self).__init__() - - # First convolution - self.features = nn.Sequential(OrderedDict([ - ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)), - ('norm0', nn.BatchNorm2d(num_init_features)), - ('relu0', nn.ReLU(inplace=True)), - ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), - ])) - - # Each denseblock - num_features = num_init_features - for i, num_layers in enumerate(block_config): - block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, - bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate) - self.features.add_module('denseblock%d' % (i + 1), block) - num_features = num_features + num_layers * growth_rate - if i != len(block_config) - 1: - trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) - self.features.add_module('transition%d' % (i + 1), trans) - num_features = num_features // 2 - - # Final batch norm - self.features.add_module('norm5', nn.BatchNorm2d(num_features)) - - # Linear layer - self.classifier = nn.Linear(num_features, num_classes) - - def forward(self, x): - features = self.features(x) - out = F.relu(features, inplace=True) - out = F.avg_pool2d(out, kernel_size=7).view(features.size(0), -1) - out = self.classifier(out) - return out diff --git a/models/hourglass.py b/models/hourglass.py deleted file mode 100644 index 14b7e81..0000000 --- a/models/hourglass.py +++ /dev/null @@ -1,142 +0,0 @@ -''' -Hourglass network inserted in the pre-activated Resnet -Use lr=0.01 for current version -(c) YANG, Wei -''' -import torch.nn as nn -import torch.nn.functional as F -import math -from .preresnet import BasicBlock, Bottleneck - -__all__ = ['Hourglass', 'HourglassNet', 'hgnet20', 'hgnet32', 'hgnet44', 'hgnet56', - 'hgnet110', 'hgnet1202'] - - -class Hourglass(nn.Module): - def __init__(self, block, num_blocks, planes, depth): - super(Hourglass, self).__init__() - self.depth = depth - self.residual = self._make_layer(block, num_blocks, planes) - self.bn = nn.BatchNorm2d(planes) - self.upsample = nn.UpsamplingNearest2d(scale_factor=2) - - def 
_make_layer(self, block, num_blocks, planes): - layers = [] - for i in range(0, num_blocks): - layers.append(block(planes*block.expansion, planes)) - return nn.Sequential(*layers) - - def _hour_glass(self, n, x): - up1 = self.residual(x) - low1 = F.max_pool2d(x, 2, stride=2) - low1 = self.residual(low1) - - if n > 1: - low2 = self._hour_glass(n-1, low1) - else: - low2 = self.residual(low1) - low3 = self.residual(low2) - up2 = self.upsample(low3) - out = up1 + up2 - return out - - def forward(self, x): - return self._hour_glass(self.depth, x) - -class HourglassNet(nn.Module): - - def __init__(self, block, layers, num_blocks=1, num_classes=1000): - self.inplanes = 16 - super(HourglassNet, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1, - bias=False) - self.bn1 = nn.BatchNorm2d(16) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self._make_layer(block, 16, layers[0], num_blocks=num_blocks, depth=3) - self.layer2 = self._make_layer(block, 32, layers[1], stride=2, num_blocks=num_blocks, depth=2) - self.layer3 = self._make_layer(block, 64, layers[2], stride=2, num_blocks=num_blocks, depth=1) - self.avgpool = nn.AvgPool2d(8) - self.fc = nn.Linear(64 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(0, math.sqrt(2. / n)) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def _make_layer(self, block, planes, blocks, stride=1, num_blocks=1, depth=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - # add hourglass module - layers.append(Hourglass(block, num_blocks, planes, depth)) - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) # 32x32 - - x = self.layer1(x) # 32x32 - x = self.layer2(x) # 16x16 - x = self.layer3(x) # 8x8 - - x = self.avgpool(x) - x = x.view(x.size(0), -1) - x = self.fc(x) - - return x - - -def hgnet20(**kwargs): - """Constructs a HourglassNet-20 model. - """ - model = HourglassNet(BasicBlock, [3, 3, 3], **kwargs) - return model - - -def hgnet32(**kwargs): - """Constructs a HourglassNet-32 model. - """ - model = HourglassNet(BasicBlock, [5, 5, 5], **kwargs) - return model - - -def hgnet44(**kwargs): - """Constructs a HourglassNet-44 model. - """ - model = HourglassNet(Bottleneck, [7, 7, 7], **kwargs) - return model - - -def hgnet56(**kwargs): - """Constructs a HourglassNet-56 model. - """ - model = HourglassNet(Bottleneck, [9, 9, 9], **kwargs) - return model - - -def hgnet110(**kwargs): - """Constructs a HourglassNet-110 model. - """ - model = HourglassNet(Bottleneck, [18, 18, 18], **kwargs) - return model - -def hgnet1202(**kwargs): - """Constructs a HourglassNet-1202 model. 
- """ - model = HourglassNet(Bottleneck, [200, 200, 200], **kwargs) - return model \ No newline at end of file diff --git a/models/inception.py b/models/inception.py deleted file mode 100644 index 43de39e..0000000 --- a/models/inception.py +++ /dev/null @@ -1,326 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.model_zoo as model_zoo - - -__all__ = ['Inception3', 'inception_v3'] - - -model_urls = { - # Inception v3 ported from TensorFlow - 'inception_v3_google': 'https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth', -} - - -def inception_v3(pretrained=False, **kwargs): - r"""Inception v3 model architecture from - `"Rethinking the Inception Architecture for Computer Vision" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - if 'transform_input' not in kwargs: - kwargs['transform_input'] = True - model = Inception3(**kwargs) - model.load_state_dict(model_zoo.load_url(model_urls['inception_v3_google'])) - return model - - return Inception3(**kwargs) - - -class Inception3(nn.Module): - - def __init__(self, num_classes=1000, aux_logits=True, transform_input=False): - super(Inception3, self).__init__() - self.aux_logits = aux_logits - self.transform_input = transform_input - self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, stride=2) - self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3) - self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1) - self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1) - self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3) - self.Mixed_5b = InceptionA(192, pool_features=32) - self.Mixed_5c = InceptionA(256, pool_features=64) - self.Mixed_5d = InceptionA(288, pool_features=64) - self.Mixed_6a = InceptionB(288) - self.Mixed_6b = InceptionC(768, channels_7x7=128) - self.Mixed_6c = InceptionC(768, channels_7x7=160) - self.Mixed_6d = InceptionC(768, channels_7x7=160) - self.Mixed_6e = InceptionC(768, channels_7x7=192) - if aux_logits: - self.AuxLogits = InceptionAux(768, num_classes) - self.Mixed_7a = InceptionD(768) - self.Mixed_7b = InceptionE(1280) - self.Mixed_7c = InceptionE(2048) - self.fc = nn.Linear(2048, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): - import scipy.stats as stats - stddev = m.stddev if hasattr(m, 'stddev') else 0.1 - X = stats.truncnorm(-2, 2, scale=stddev) - values = torch.Tensor(X.rvs(m.weight.data.numel())) - m.weight.data.copy_(values) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def forward(self, x): - if self.transform_input: - x = x.clone() - x[:, 0] = x[:, 0] * (0.229 / 0.5) + (0.485 - 0.5) / 0.5 - x[:, 1] = x[:, 1] * (0.224 / 0.5) + (0.456 - 0.5) / 0.5 - x[:, 2] = x[:, 2] * (0.225 / 0.5) + (0.406 - 0.5) / 0.5 - # 299 x 299 x 3 - x = self.Conv2d_1a_3x3(x) - # 149 x 149 x 32 - x = self.Conv2d_2a_3x3(x) - # 147 x 147 x 32 - x = self.Conv2d_2b_3x3(x) - # 147 x 147 x 64 - x = F.max_pool2d(x, kernel_size=3, stride=2) - # 73 x 73 x 64 - x = self.Conv2d_3b_1x1(x) - # 73 x 73 x 80 - x = self.Conv2d_4a_3x3(x) - # 71 x 71 x 192 - x = F.max_pool2d(x, kernel_size=3, stride=2) - # 35 x 35 x 192 - x = self.Mixed_5b(x) - # 35 x 35 x 256 - x = self.Mixed_5c(x) - # 35 x 35 x 288 - x = self.Mixed_5d(x) - # 35 x 35 x 288 - x = self.Mixed_6a(x) - # 17 x 17 x 768 - x = self.Mixed_6b(x) - # 17 x 17 x 768 - x = self.Mixed_6c(x) - # 17 x 17 x 768 - x = self.Mixed_6d(x) - # 17 x 17 x 768 - x = self.Mixed_6e(x) - # 17 x 
17 x 768 - if self.training and self.aux_logits: - aux = self.AuxLogits(x) - # 17 x 17 x 768 - x = self.Mixed_7a(x) - # 8 x 8 x 1280 - x = self.Mixed_7b(x) - # 8 x 8 x 2048 - x = self.Mixed_7c(x) - # 8 x 8 x 2048 - x = F.avg_pool2d(x, kernel_size=8) - # 1 x 1 x 2048 - x = F.dropout(x, training=self.training) - # 1 x 1 x 2048 - x = x.view(x.size(0), -1) - # 2048 - x = self.fc(x) - # 1000 (num_classes) - if self.training and self.aux_logits: - return x, aux - return x - - -class InceptionA(nn.Module): - - def __init__(self, in_channels, pool_features): - super(InceptionA, self).__init__() - self.branch1x1 = BasicConv2d(in_channels, 64, kernel_size=1) - - self.branch5x5_1 = BasicConv2d(in_channels, 48, kernel_size=1) - self.branch5x5_2 = BasicConv2d(48, 64, kernel_size=5, padding=2) - - self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1) - self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1) - self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, padding=1) - - self.branch_pool = BasicConv2d(in_channels, pool_features, kernel_size=1) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch5x5 = self.branch5x5_1(x) - branch5x5 = self.branch5x5_2(branch5x5) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] - return torch.cat(outputs, 1) - - -class InceptionB(nn.Module): - - def __init__(self, in_channels): - super(InceptionB, self).__init__() - self.branch3x3 = BasicConv2d(in_channels, 384, kernel_size=3, stride=2) - - self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1) - self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1) - self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, stride=2) - - def forward(self, x): - branch3x3 = self.branch3x3(x) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) - - branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) - - outputs = [branch3x3, branch3x3dbl, branch_pool] - return torch.cat(outputs, 1) - - -class InceptionC(nn.Module): - - def __init__(self, in_channels, channels_7x7): - super(InceptionC, self).__init__() - self.branch1x1 = BasicConv2d(in_channels, 192, kernel_size=1) - - c7 = channels_7x7 - self.branch7x7_1 = BasicConv2d(in_channels, c7, kernel_size=1) - self.branch7x7_2 = BasicConv2d(c7, c7, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7_3 = BasicConv2d(c7, 192, kernel_size=(7, 1), padding=(3, 0)) - - self.branch7x7dbl_1 = BasicConv2d(in_channels, c7, kernel_size=1) - self.branch7x7dbl_2 = BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7dbl_3 = BasicConv2d(c7, c7, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7dbl_4 = BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7dbl_5 = BasicConv2d(c7, 192, kernel_size=(1, 7), padding=(0, 3)) - - self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch7x7 = self.branch7x7_1(x) - branch7x7 = self.branch7x7_2(branch7x7) - branch7x7 = self.branch7x7_3(branch7x7) - - branch7x7dbl = self.branch7x7dbl_1(x) - branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) - branch7x7dbl = 
self.branch7x7dbl_4(branch7x7dbl) - branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] - return torch.cat(outputs, 1) - - -class InceptionD(nn.Module): - - def __init__(self, in_channels): - super(InceptionD, self).__init__() - self.branch3x3_1 = BasicConv2d(in_channels, 192, kernel_size=1) - self.branch3x3_2 = BasicConv2d(192, 320, kernel_size=3, stride=2) - - self.branch7x7x3_1 = BasicConv2d(in_channels, 192, kernel_size=1) - self.branch7x7x3_2 = BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3)) - self.branch7x7x3_3 = BasicConv2d(192, 192, kernel_size=(7, 1), padding=(3, 0)) - self.branch7x7x3_4 = BasicConv2d(192, 192, kernel_size=3, stride=2) - - def forward(self, x): - branch3x3 = self.branch3x3_1(x) - branch3x3 = self.branch3x3_2(branch3x3) - - branch7x7x3 = self.branch7x7x3_1(x) - branch7x7x3 = self.branch7x7x3_2(branch7x7x3) - branch7x7x3 = self.branch7x7x3_3(branch7x7x3) - branch7x7x3 = self.branch7x7x3_4(branch7x7x3) - - branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) - outputs = [branch3x3, branch7x7x3, branch_pool] - return torch.cat(outputs, 1) - - -class InceptionE(nn.Module): - - def __init__(self, in_channels): - super(InceptionE, self).__init__() - self.branch1x1 = BasicConv2d(in_channels, 320, kernel_size=1) - - self.branch3x3_1 = BasicConv2d(in_channels, 384, kernel_size=1) - self.branch3x3_2a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) - self.branch3x3_2b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) - - self.branch3x3dbl_1 = BasicConv2d(in_channels, 448, kernel_size=1) - self.branch3x3dbl_2 = BasicConv2d(448, 384, kernel_size=3, padding=1) - self.branch3x3dbl_3a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) - self.branch3x3dbl_3b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) - - self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1) - - def forward(self, x): - branch1x1 = self.branch1x1(x) - - branch3x3 = self.branch3x3_1(x) - branch3x3 = [ - self.branch3x3_2a(branch3x3), - self.branch3x3_2b(branch3x3), - ] - branch3x3 = torch.cat(branch3x3, 1) - - branch3x3dbl = self.branch3x3dbl_1(x) - branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) - branch3x3dbl = [ - self.branch3x3dbl_3a(branch3x3dbl), - self.branch3x3dbl_3b(branch3x3dbl), - ] - branch3x3dbl = torch.cat(branch3x3dbl, 1) - - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) - branch_pool = self.branch_pool(branch_pool) - - outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] - return torch.cat(outputs, 1) - - -class InceptionAux(nn.Module): - - def __init__(self, in_channels, num_classes): - super(InceptionAux, self).__init__() - self.conv0 = BasicConv2d(in_channels, 128, kernel_size=1) - self.conv1 = BasicConv2d(128, 768, kernel_size=5) - self.conv1.stddev = 0.01 - self.fc = nn.Linear(768, num_classes) - self.fc.stddev = 0.001 - - def forward(self, x): - # 17 x 17 x 768 - x = F.avg_pool2d(x, kernel_size=5, stride=3) - # 5 x 5 x 768 - x = self.conv0(x) - # 5 x 5 x 128 - x = self.conv1(x) - # 1 x 1 x 768 - x = x.view(x.size(0), -1) - # 768 - x = self.fc(x) - # 1000 - return x - - -class BasicConv2d(nn.Module): - - def __init__(self, in_channels, out_channels, **kwargs): - super(BasicConv2d, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) - self.bn = nn.BatchNorm2d(out_channels, eps=0.001) - - def 
forward(self, x): - x = self.conv(x) - x = self.bn(x) - return F.relu(x, inplace=True) diff --git a/models/modules.py b/models/modules.py deleted file mode 100644 index f81a0a3..0000000 --- a/models/modules.py +++ /dev/null @@ -1,103 +0,0 @@ -''' -Useful modules for building deep neural networks -Copyright (c) YANG, Wei 2017 -''' -import torch -from torch.autograd import Variable -import torch.nn as nn -import torch.nn.functional as F -import math - -__all__ = ['SoftmaxAttention', 'SigmoidAttention'] - -class SoftmaxAttention(nn.Module): - # implementation of Wang et al. "Residual Attention Network for Image Classification". CVPR, 2017. - def __init__(self, planes, residual=True, normalize=False): - super(SoftmaxAttention, self).__init__() - self.residual = residual - self.normalize = normalize - self.bn1 = nn.BatchNorm2d(planes) - self.conv1 = nn.Conv2d(planes, planes, kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(planes, 1, kernel_size=1, bias=False) - if self.normalize == True: - self.bn3 = nn.BatchNorm2d(1) - self.softmax = nn.Softmax() - self.mask = None - - def get_mask(self): - return self.mask - - def forward(self, x): - # preactivate - mask = self.bn1(x) - mask = self.relu(mask) - mask = self.conv1(mask) - mask = self.bn2(mask) - mask = self.relu(mask) - mask = self.conv2(mask) - # print('min: %.4f | max: %.4f' % (mask.data.min(), mask.data.max())) - if self.normalize == True: - mask = self.bn3(mask) - # print('min: %.4f | max: %.4f' % (mask.data.min(), mask.data.max())) - mask = mask.view(mask.size(0), -1) - mask = self.softmax(mask) - mask = mask.view(mask.size(0), 1, x.size(2), x.size(3)) - const = F.max_pool2d(mask, mask.size(2)) - mask = mask / const.expand_as(mask) - self.mask = mask - - - out = x * mask.expand_as(x) - if self.residual == True: - out += x - - return out - - -class SigmoidAttention(nn.Module): - # implementation of Wang et al. "Residual Attention Network for Image Classification". CVPR, 2017. - def __init__(self, planes, residual=True, normalize=False): - super(SigmoidAttention, self).__init__() - self.residual = residual - self.bn1 = nn.BatchNorm2d(planes) - self.conv1 = nn.Conv2d(planes, planes, kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(planes, 1, kernel_size=1, bias=False) - self.sigmoid = nn.Sigmoid() - self.mask = None - - def get_mask(self): - return self.mask - - def forward(self, x): - # preactivate - mask = self.bn1(x) - mask = self.relu(mask) - mask = self.conv1(mask) - mask = self.bn2(mask) - mask = self.relu(mask) - mask = self.conv2(mask) - mask = self.sigmoid(mask) - self.mask = mask - - - out = x * mask.expand_as(x) - if self.residual: - out += x - - return out - -# # Softmax Attention -# model = SoftmaxAttention(3).cuda() -# x = torch.randn(2,3,32,32) -# out = model(Variable(x.cuda())) -# print(out.size()) - -# # Sigmoid Attention -# model = SigmoidAttention(3).cuda() -# x = torch.randn(2,3,32,32) -# out = model(Variable(x.cuda())) -# print(out.size()) \ No newline at end of file diff --git a/models/preresnet.py b/models/preresnet.py deleted file mode 100644 index 29b616c..0000000 --- a/models/preresnet.py +++ /dev/null @@ -1,182 +0,0 @@ -'''Pre-activated Resnet for cifar dataset. 
-Ported form https://github.com/facebook/fb.resnet.torch/blob/master/models/preresnet.lua -(c) YANG, Wei -''' -import torch.nn as nn -import math -import torch.utils.model_zoo as model_zoo - - -__all__ = ['PreResNet', 'preresnet20', 'preresnet32', 'preresnet44', 'preresnet56', - 'preresnet110', 'preresnet1202'] - -def conv3x3(in_planes, out_planes, stride=1): - "3x3 convolution with padding" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=1, bias=False) - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None): - super(BasicBlock, self).__init__() - self.bn1 = nn.BatchNorm2d(inplanes) - self.conv1 = conv3x3(inplanes, planes, stride) - self.relu = nn.ReLU(inplace=True) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = conv3x3(planes, planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.bn1(x) - out = self.relu(out) - out = self.conv1(out) - - out = self.bn2(out) - out = self.relu(out) - out = self.conv2(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - - return out - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, inplanes, planes, stride=1, downsample=None): - super(Bottleneck, self).__init__() - self.bn1 = nn.BatchNorm2d(inplanes) - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, - padding=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.bn1(x) - out = self.relu(out) - out = self.conv1(out) - - out = self.bn2(out) - out = self.relu(out) - out = self.conv2(out) - - out = self.bn3(out) - out = self.relu(out) - out = self.conv3(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - - return out - - -class PreResNet(nn.Module): - - def __init__(self, block, layers, num_classes=1000): - self.inplanes = 16 - super(PreResNet, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1, - bias=False) - self.layer1 = self._make_layer(block, 16, layers[0]) - self.layer2 = self._make_layer(block, 32, layers[1], stride=2) - self.layer3 = self._make_layer(block, 64, layers[2], stride=2) - self.bn = nn.BatchNorm2d(64*block.expansion) - self.relu = nn.ReLU(inplace=True) - self.avgpool = nn.AvgPool2d(8) - self.fc = nn.Linear(64*block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(0, math.sqrt(2. 
/ n)) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def _make_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.relu(self.bn(x)) - x = self.avgpool(x) - x = x.view(x.size(0), -1) - x = self.fc(x) - - return x - - -def preresnet20(**kwargs): - """Constructs a PreResNet-20 model. - """ - model = PreResNet(BasicBlock, [3, 3, 3], **kwargs) - return model - - -def preresnet32(**kwargs): - """Constructs a PreResNet-32 model. - """ - model = PreResNet(BasicBlock, [5, 5, 5], **kwargs) - return model - - -def preresnet44(**kwargs): - """Constructs a PreResNet-44 model. - """ - model = PreResNet(Bottleneck, [7, 7, 7], **kwargs) - return model - - -def preresnet56(**kwargs): - """Constructs a PreResNet-56 model. - """ - model = PreResNet(Bottleneck, [9, 9, 9], **kwargs) - return model - - -def preresnet110(**kwargs): - """Constructs a PreResNet-110 model. - """ - model = PreResNet(Bottleneck, [18, 18, 18], **kwargs) - return model - -def preresnet1202(**kwargs): - """Constructs a PreResNet-1202 model. - """ - model = PreResNet(Bottleneck, [200, 200, 200], **kwargs) - return model \ No newline at end of file diff --git a/models/resadvnet.py b/models/resadvnet.py deleted file mode 100644 index c71b89a..0000000 --- a/models/resadvnet.py +++ /dev/null @@ -1,268 +0,0 @@ -'''Pre-activated Resnet for cifar dataset. -Ported form https://github.com/facebook/fb.resnet.torch/blob/master/models/resadvnet.lua -(c) YANG, Wei -''' -import torch.nn as nn -import torch.nn.functional as F -import math -from .preresnet import BasicBlock, Bottleneck -from .hourglass import Hourglass -from .modules import SoftmaxAttention - -# __all__ = ['Attention', 'ResAdvNet', 'resadvnet20', 'resadvnet32', 'resadvnet44', 'resadvnet56', -# 'resadvnet110', 'resadvnet1202'] - -# class Attention(nn.Module): -# # implementation of Wang et al. "Residual Attention Network for Image Classification". CVPR, 2017. 
-# def __init__(self, block, p, t, r, planes, depth): -# super(Attention, self).__init__() -# self.p = p -# self.t = t -# out_planes = planes*block.expansion -# self.residual = block(out_planes, planes) -# self.hourglass = Hourglass(block, r, planes, depth) -# self.fc1 = nn.Conv2d(out_planes, out_planes, kernel_size=1, bias=False) -# self.fc2 = nn.Conv2d(out_planes, 1, kernel_size=1, bias=False) - -# def get_mask(self): -# return self.mx - -# def forward(self, x): -# # preprocessing -# for i in range(0, self.p): -# x = self.residual(x) - -# # trunk branch -# tx = x -# for i in range(0, self.p): -# tx = self.residual(tx) - -# # mask branch -# self.mx = F.sigmoid(self.fc2(self.fc1(self.hourglass(x)))) - -# # residual attented feature -# out = tx + tx*self.mx.expand_as(tx) - -# return out - -class StackedAdversary(nn.Module): - def __init__(self, block, planes, num_stacks=3, residual=False, normalize=False): - super(StackedAdversary, self).__init__() - self.num_stacks = num_stacks - attentions = [] - for s in range(0, self.num_stacks): - attentions.append(SoftmaxAttention(planes, normalize=normalize, residual=residual)) - self.attention = nn.ModuleList(attentions) - - def get_mask(self): - mask = [] - for _, att in enumerate(self.attention): - mask.append(att.get_mask()) - return mask - - def forward(self, x): - out = x.clone() - self.mask = [] - att_f = self.attention[0](x) - out += att_f - for s in range(1, self.num_stacks): - adv_f = x - att_f - att_f = self.attention[s](adv_f) - x = adv_f - out += att_f - - return out - -class ResAdvNet(nn.Module): - - def __init__(self, block, layers, residual=False, normalize=False, num_classes=1000): - self.inplanes = 16 - super(ResAdvNet, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1, - bias=False) - self.layer1 = self._make_layer(block, 16, layers[0]) - self.adv1 = StackedAdversary(block, 16 * block.expansion, - num_stacks=5, residual=False, normalize=False) - self.layer2 = self._make_layer(block, 32, layers[1], stride=2) - self.adv2 = StackedAdversary(block, 32 * block.expansion, - num_stacks=5, residual=False, normalize=False) - self.layer3 = self._make_layer(block, 64, layers[2], stride=2) - self.adv3 = StackedAdversary(block, 64 * block.expansion, - num_stacks=5, residual=False, normalize=False) - self.bn = nn.BatchNorm2d(64 * block.expansion) - self.relu = nn.ReLU(inplace=True) - self.avgpool = nn.AvgPool2d(8) - self.fc = nn.Linear(64 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(0, math.sqrt(2. 
/ n)) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def _make_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - return nn.Sequential(*layers) - - def get_mask(self): # get attention mask - masks = [] - # masks.append(self.adv1.get_mask()) - masks.append(self.adv2.get_mask()) - masks.append(self.adv3.get_mask()) - return masks - - def forward(self, x): - x = self.conv1(x) - x = self.layer1(x) - # x = self.adv1(x) - x = self.layer2(x) - x = self.adv2(x) - x = self.layer3(x) - x = self.adv3(x) - x = self.relu(self.bn(x)) - - x = self.avgpool(x) - x = x.view(x.size(0), -1) - x = self.fc(x) - - return x - - -def resadvnet20(**kwargs): - """Constructs a ResAdvNet-20 model. - """ - model = ResAdvNet(BasicBlock, [3, 3, 3], **kwargs) - return model - - -def resadvnet32(**kwargs): - """Constructs a ResAdvNet-32 model. - """ - model = ResAdvNet(BasicBlock, [5, 5, 5], **kwargs) - return model - - -def resadvnet44(**kwargs): - """Constructs a ResAdvNet-44 model. - """ - model = ResAdvNet(Bottleneck, [7, 7, 7], **kwargs) - return model - - -def resadvnet56(**kwargs): - """Constructs a ResAdvNet-56 model. - """ - model = ResAdvNet(Bottleneck, [9, 9, 9], **kwargs) - return model - - -def resadvnet110(**kwargs): - """Constructs a ResAdvNet-110 model. - """ - model = ResAdvNet(Bottleneck, [18, 18, 18], **kwargs) - return model - -def resadvnet1202(**kwargs): - """Constructs a ResAdvNet-1202 model. - """ - model = ResAdvNet(Bottleneck, [200, 200, 200], **kwargs) - return model - -# ---------------------------- - -def resadvbn20(**kwargs): - """Constructs a ResAdvNet-20 model. - """ - model = ResAdvNet(BasicBlock, [3, 3, 3], normalize=True, **kwargs) - return model - - -def resadvbn32(**kwargs): - """Constructs a ResAdvNet-32 model. - """ - model = ResAdvNet(BasicBlock, [5, 5, 5], normalize=True, **kwargs) - return model - - -def resadvbn44(**kwargs): - """Constructs a ResAdvNet-44 model. - """ - model = ResAdvNet(Bottleneck, [7, 7, 7], normalize=True, **kwargs) - return model - - -def resadvbn56(**kwargs): - """Constructs a ResAdvNet-56 model. - """ - model = ResAdvNet(Bottleneck, [9, 9, 9], normalize=True, **kwargs) - return model - - -def resadvbn110(**kwargs): - """Constructs a ResAdvNet-110 model. - """ - model = ResAdvNet(Bottleneck, [18, 18, 18], normalize=True, **kwargs) - return model - -def resadvbn1202(**kwargs): - """Constructs a ResAdvNet-1202 model. - """ - model = ResAdvNet(Bottleneck, [200, 200, 200], normalize=True, **kwargs) - return model - -# ------------------------------------- - -def resadvbnres20(**kwargs): - """Constructs a ResAdvNet-20 model. - """ - model = ResAdvNet(BasicBlock, [3, 3, 3], normalize=True, residual=True, **kwargs) - return model - - -def resadvbnres32(**kwargs): - """Constructs a ResAdvNet-32 model. - """ - model = ResAdvNet(BasicBlock, [5, 5, 5], normalize=True, residual=True, **kwargs) - return model - - -def resadvbnres44(**kwargs): - """Constructs a ResAdvNet-44 model. 
- """ - model = ResAdvNet(Bottleneck, [7, 7, 7], normalize=True, residual=True, **kwargs) - return model - - -def resadvbnres56(**kwargs): - """Constructs a ResAdvNet-56 model. - """ - model = ResAdvNet(Bottleneck, [9, 9, 9], normalize=True, residual=True, **kwargs) - return model - - -def resadvbnres110(**kwargs): - """Constructs a ResAdvNet-110 model. - """ - model = ResAdvNet(Bottleneck, [18, 18, 18], normalize=True, residual=True, **kwargs) - return model - -def resadvbnres1202(**kwargs): - """Constructs a ResAdvNet-1202 model. - """ - model = ResAdvNet(Bottleneck, [200, 200, 200], normalize=True, residual=True, **kwargs) - return model \ No newline at end of file diff --git a/models/resattnet.py b/models/resattnet.py deleted file mode 100644 index b4e5813..0000000 --- a/models/resattnet.py +++ /dev/null @@ -1,164 +0,0 @@ -'''Pre-activated Resnet for cifar dataset. -Ported form https://github.com/facebook/fb.resnet.torch/blob/master/models/resattnet.lua -(c) YANG, Wei -''' -import torch.nn as nn -import torch.nn.functional as F -import math -from .preresnet import BasicBlock, Bottleneck -from .hourglass import Hourglass - - -__all__ = ['Attention', 'ResAttNet', 'resattnet20', 'resattnet32', 'resattnet44', 'resattnet56', - 'resattnet110', 'resattnet1202'] - -class Attention(nn.Module): - # implementation of Wang et al. "Residual Attention Network for Image Classification". CVPR, 2017. - def __init__(self, block, p, t, r, planes, depth): - super(Attention, self).__init__() - self.p = p - self.t = t - out_planes = planes*block.expansion - self.residual = block(out_planes, planes) - self.hourglass = Hourglass(block, r, planes, depth) - self.bn = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.fc1 = nn.Sequential( - nn.Conv2d(out_planes, out_planes, - kernel_size=1, bias=False), - nn.BatchNorm2d(out_planes), - ) - self.fc2 = nn.Conv2d(out_planes, 1, kernel_size=1, bias=False) - - def get_mask(self): - return self.mx - - def forward(self, x): - # preprocessing - for i in range(0, self.p): - x = self.residual(x) - - # trunk branch - tx = x - for i in range(0, self.p): - tx = self.residual(tx) - - # mask branch - mx = self.relu(self.bn(self.hourglass(x))) - mx = self.fc1(mx) - mx = self.fc2(mx) - self.mx = F.sigmoid(mx) - - # residual attented feature - out = tx + tx*self.mx.expand_as(tx) - - return out - -class ResAttNet(nn.Module): - - def __init__(self, block, layers, num_classes=1000): - self.inplanes = 16 - super(ResAttNet, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1, - bias=False) - self.bn1 = nn.BatchNorm2d(16) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self._make_layer(block, 16, layers[0]) - self.att1 = Attention(block, 1, 2, 1, 16, 3) - self.layer2 = self._make_layer(block, 32, layers[1], stride=2) - self.att2 = Attention(block, 1, 2, 1, 32, 2) - self.layer3 = self._make_layer(block, 64, layers[2], stride=2) - self.att3 = Attention(block, 1, 2, 1, 64, 2) - self.bn2 = nn.BatchNorm2d(64) - self.avgpool = nn.AvgPool2d(8) - self.fc = nn.Linear(64 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(0, math.sqrt(2. 
/ n)) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def _make_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - return nn.Sequential(*layers) - - def get_mask(self): # get attention mask - masks = [] - masks.append(self.att1.get_mask()) - masks.append(self.att2.get_mask()) - return masks - - def forward(self, x): - x = self.conv1(x) - - x = self.layer1(x) - x = self.att1(x) - x = self.layer2(x) - x = self.att2(x) - x = self.layer3(x) - x = self.att3(x) - x = self.bn2(x) - x = self.relu(x) - - x = self.avgpool(x) - x = x.view(x.size(0), -1) - x = self.fc(x) - - return x - - -def resattnet20(**kwargs): - """Constructs a ResAttNet-20 model. - """ - model = ResAttNet(BasicBlock, [3, 3, 3], **kwargs) - return model - - -def resattnet32(**kwargs): - """Constructs a ResAttNet-32 model. - """ - model = ResAttNet(BasicBlock, [5, 5, 5], **kwargs) - return model - - -def resattnet44(**kwargs): - """Constructs a ResAttNet-44 model. - """ - model = ResAttNet(Bottleneck, [7, 7, 7], **kwargs) - return model - - -def resattnet56(**kwargs): - """Constructs a ResAttNet-56 model. - """ - model = ResAttNet(Bottleneck, [9, 9, 9], **kwargs) - return model - - -def resattnet110(**kwargs): - """Constructs a ResAttNet-110 model. - """ - model = ResAttNet(Bottleneck, [18, 18, 18], **kwargs) - return model - -def resattnet1202(**kwargs): - """Constructs a ResAttNet-1202 model. - """ - model = ResAttNet(Bottleneck, [200, 200, 200], **kwargs) - return model \ No newline at end of file diff --git a/models/resnet.py b/models/resnet.py index aaa515d..e4c550a 100644 --- a/models/resnet.py +++ b/models/resnet.py @@ -4,7 +4,6 @@ ''' import torch.nn as nn import math -import torch.utils.model_zoo as model_zoo __all__ = ['ResNet', 'resnet20', 'resnet32', 'resnet44', 'resnet56', diff --git a/models/resnext.py b/models/resnext.py new file mode 100644 index 0000000..dd8908b --- /dev/null +++ b/models/resnext.py @@ -0,0 +1,126 @@ +from __future__ import division +""" +Creates a ResNeXt Model as defined in: +Xie, S., Girshick, R., Dollar, P., Tu, Z., & He, K. (2016). +Aggregated residual transformations for deep neural networks. +arXiv preprint arXiv:1611.05431. +import from https://github.com/prlz77/ResNeXt.pytorch/blob/master/models/model.py +""" +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import init + +__all__ = ['resnext'] + +class ResNeXtBottleneck(nn.Module): + """ + RexNeXt bottleneck type C (https://github.com/facebookresearch/ResNeXt/blob/master/models/resnext.lua) + """ + def __init__(self, in_channels, out_channels, stride, cardinality, widen_factor): + """ Constructor + Args: + in_channels: input channel dimensionality + out_channels: output channel dimensionality + stride: conv stride. Replaces pooling layer. + cardinality: num of convolution groups. + widen_factor: factor to reduce the input dimensionality before convolution. 
+ """ + super(ResNeXtBottleneck, self).__init__() + D = cardinality * out_channels // widen_factor + self.conv_reduce = nn.Conv2d(in_channels, D, kernel_size=1, stride=1, padding=0, bias=False) + self.bn_reduce = nn.BatchNorm2d(D) + self.conv_conv = nn.Conv2d(D, D, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False) + self.bn = nn.BatchNorm2d(D) + self.conv_expand = nn.Conv2d(D, out_channels, kernel_size=1, stride=1, padding=0, bias=False) + self.bn_expand = nn.BatchNorm2d(out_channels) + + self.shortcut = nn.Sequential() + if in_channels != out_channels: + self.shortcut.add_module('shortcut_conv', nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, bias=False)) + self.shortcut.add_module('shortcut_bn', nn.BatchNorm2d(out_channels)) + + def forward(self, x): + bottleneck = self.conv_reduce.forward(x) + bottleneck = F.relu(self.bn_reduce.forward(bottleneck), inplace=True) + bottleneck = self.conv_conv.forward(bottleneck) + bottleneck = F.relu(self.bn.forward(bottleneck), inplace=True) + bottleneck = self.conv_expand.forward(bottleneck) + bottleneck = self.bn_expand.forward(bottleneck) + residual = self.shortcut.forward(x) + return F.relu(residual + bottleneck, inplace=True) + + +class CifarResNeXt(nn.Module): + """ + ResNext optimized for the Cifar dataset, as specified in + https://arxiv.org/pdf/1611.05431.pdf + """ + def __init__(self, cardinality, depth, num_classes, widen_factor=4): + """ Constructor + Args: + cardinality: number of convolution groups. + depth: number of layers. + num_classes: number of classes + widen_factor: factor to adjust the channel dimensionality + """ + super(CifarResNeXt, self).__init__() + self.cardinality = cardinality + self.depth = depth + self.block_depth = (self.depth - 2) // 9 + self.widen_factor = widen_factor + self.num_classes = num_classes + self.output_size = 64 + self.stages = [64, 64 * self.widen_factor, 128 * self.widen_factor, 256 * self.widen_factor] + + self.conv_1_3x3 = nn.Conv2d(3, 64, 3, 1, 1, bias=False) + self.bn_1 = nn.BatchNorm2d(64) + self.stage_1 = self.block('stage_1', self.stages[0], self.stages[1], 1) + self.stage_2 = self.block('stage_2', self.stages[1], self.stages[2], 2) + self.stage_3 = self.block('stage_3', self.stages[2], self.stages[3], 2) + self.classifier = nn.Linear(1024, num_classes) + init.kaiming_normal(self.classifier.weight) + + for key in self.state_dict(): + if key.split('.')[-1] == 'weight': + if 'conv' in key: + init.kaiming_normal(self.state_dict()[key], mode='fan_out') + if 'bn' in key: + self.state_dict()[key][...] = 1 + elif key.split('.')[-1] == 'bias': + self.state_dict()[key][...] = 0 + + def block(self, name, in_channels, out_channels, pool_stride=2): + """ Stack n bottleneck modules where n is inferred from the depth of the network. + Args: + name: string name of the current block. + in_channels: number of input channels + out_channels: number of output channels + pool_stride: factor to reduce the spatial dimensionality in the first bottleneck of the block. + Returns: a Module consisting of n sequential bottlenecks. 
+ """ + block = nn.Sequential() + for bottleneck in range(self.block_depth): + name_ = '%s_bottleneck_%d' % (name, bottleneck) + if bottleneck == 0: + block.add_module(name_, ResNeXtBottleneck(in_channels, out_channels, pool_stride, self.cardinality, + self.widen_factor)) + else: + block.add_module(name_, + ResNeXtBottleneck(out_channels, out_channels, 1, self.cardinality, self.widen_factor)) + return block + + def forward(self, x): + x = self.conv_1_3x3.forward(x) + x = F.relu(self.bn_1.forward(x), inplace=True) + x = self.stage_1.forward(x) + x = self.stage_2.forward(x) + x = self.stage_3.forward(x) + x = F.avg_pool2d(x, 8, 1) + x = x.view(-1, 1024) + return self.classifier(x) + +def resnext(**kwargs): + """Constructs a ResNeXt. + """ + model = CifarResNeXt(**kwargs) + return model \ No newline at end of file diff --git a/models/ressoftattnet.py b/models/ressoftattnet.py deleted file mode 100644 index 9b33be4..0000000 --- a/models/ressoftattnet.py +++ /dev/null @@ -1,200 +0,0 @@ -'''Pre-activated Resnet for cifar dataset. -Ported form https://github.com/facebook/fb.resnet.torch/blob/master/models/ressoftattnet.lua -(c) YANG, Wei - -V2: softattention -''' -import torch.nn as nn -import torch.nn.functional as F -import math -from .preresnet import BasicBlock, Bottleneck -from .hourglass import Hourglass -from .modules import * - - -# __all__ = ['ResSoftAttNet', 'ressoftattnet20', 'ressoftattnet32', 'ressoftattnet44', 'ressoftattnet56', -# 'ressoftattnet110', 'ressoftattnet1202'] - -class ResSoftAttNet(nn.Module): - - def __init__(self, block, layers, normalize=False, residual=False, num_classes=1000): - self.inplanes = 16 - super(ResSoftAttNet, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1, - bias=False) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self._make_layer(block, 16, layers[0]) - self.layer2 = self._make_layer(block, 32, layers[1], stride=2) - self.layer3 = self._make_layer(block, 64, layers[2], stride=2) - self.att = SoftmaxAttention(64 * block.expansion, normalize=normalize, residual=residual) - self.bn = nn.BatchNorm2d(64 * block.expansion) - self.avgpool = nn.AvgPool2d(8) - self.fc = nn.Linear(64 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(0, math.sqrt(2. / n)) - elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1) - m.bias.data.zero_() - - def _make_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - return nn.Sequential(*layers) - - def get_mask(self): # get attention mask - masks = [] - masks.append(self.att.get_mask()) - return masks - - def forward(self, x): - x = self.conv1(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.att(x) - x = self.bn(x) - x = self.relu(x) - - x = self.avgpool(x) - x = x.view(x.size(0), -1) - x = self.fc(x) - - return x - - -def ressoftattnet20(**kwargs): - """Constructs a ResSoftAttNet-20 model. 
- """ - model = ResSoftAttNet(BasicBlock, [3, 3, 3], **kwargs) - return model - - -def ressoftattnet32(**kwargs): - """Constructs a ResSoftAttNet-32 model. - """ - model = ResSoftAttNet(BasicBlock, [5, 5, 5], **kwargs) - return model - - -def ressoftattnet44(**kwargs): - """Constructs a ResSoftAttNet-44 model. - """ - model = ResSoftAttNet(Bottleneck, [7, 7, 7], **kwargs) - return model - - -def ressoftattnet56(**kwargs): - """Constructs a ResSoftAttNet-56 model. - """ - model = ResSoftAttNet(Bottleneck, [9, 9, 9], **kwargs) - return model - - -def ressoftattnet110(**kwargs): - """Constructs a ResSoftAttNet-110 model. - """ - model = ResSoftAttNet(Bottleneck, [18, 18, 18], **kwargs) - return model - -def ressoftattnet1202(**kwargs): - """Constructs a ResSoftAttNet-1202 model. - """ - model = ResSoftAttNet(Bottleneck, [200, 200, 200], **kwargs) - return model - -# -------------------------------------- -def ressoftattbn20(**kwargs): - """Constructs a ResSoftAttNet-20 model. - """ - model = ResSoftAttNet(BasicBlock, [3, 3, 3], normalize=True, **kwargs) - return model - - -def ressoftattbn32(**kwargs): - """Constructs a ResSoftAttNet-32 model. - """ - model = ResSoftAttNet(BasicBlock, [5, 5, 5], normalize=True, **kwargs) - return model - - -def ressoftattbn44(**kwargs): - """Constructs a ResSoftAttNet-44 model. - """ - model = ResSoftAttNet(Bottleneck, [7, 7, 7], normalize=True, **kwargs) - return model - - -def ressoftattbn56(**kwargs): - """Constructs a ResSoftAttNet-56 model. - """ - model = ResSoftAttNet(Bottleneck, [9, 9, 9], normalize=True, **kwargs) - return model - - -def ressoftattbn110(**kwargs): - """Constructs a ResSoftAttNet-110 model. - """ - model = ResSoftAttNet(Bottleneck, [18, 18, 18], normalize=True, **kwargs) - return model - -def ressoftattbn1202(**kwargs): - """Constructs a ResSoftAttNet-1202 model. - """ - model = ResSoftAttNet(Bottleneck, [200, 200, 200], normalize=True, **kwargs) - return model - -# -------------------------------------- -def ressoftattbnres20(**kwargs): - """Constructs a ResSoftAttNet-20 model. - """ - model = ResSoftAttNet(BasicBlock, [3, 3, 3], normalize=True, residual=True, **kwargs) - return model - - -def ressoftattbnres32(**kwargs): - """Constructs a ResSoftAttNet-32 model. - """ - model = ResSoftAttNet(BasicBlock, [5, 5, 5], normalize=True, residual=True, **kwargs) - return model - - -def ressoftattbnres44(**kwargs): - """Constructs a ResSoftAttNet-44 model. - """ - model = ResSoftAttNet(Bottleneck, [7, 7, 7], normalize=True, residual=True, **kwargs) - return model - - -def ressoftattbnres56(**kwargs): - """Constructs a ResSoftAttNet-56 model. - """ - model = ResSoftAttNet(Bottleneck, [9, 9, 9], normalize=True, residual=True, **kwargs) - return model - - -def ressoftattbnres110(**kwargs): - """Constructs a ResSoftAttNet-110 model. - """ - model = ResSoftAttNet(Bottleneck, [18, 18, 18], normalize=True, residual=True, **kwargs) - return model - -def ressoftattbnres1202(**kwargs): - """Constructs a ResSoftAttNet-1202 model. 
- """ - model = ResSoftAttNet(Bottleneck, [200, 200, 200], normalize=True, residual=True, **kwargs) - return model \ No newline at end of file diff --git a/models/squeezenet.py b/models/squeezenet.py deleted file mode 100644 index ddeb7d3..0000000 --- a/models/squeezenet.py +++ /dev/null @@ -1,130 +0,0 @@ -import math -import torch -import torch.nn as nn -import torch.nn.init as init -import torch.utils.model_zoo as model_zoo - - -__all__ = ['SqueezeNet', 'squeezenet1_0', 'squeezenet1_1'] - - -model_urls = { - 'squeezenet1_0': 'https://download.pytorch.org/models/squeezenet1_0-a815701f.pth', - 'squeezenet1_1': 'https://download.pytorch.org/models/squeezenet1_1-f364aa15.pth', -} - - -class Fire(nn.Module): - - def __init__(self, inplanes, squeeze_planes, - expand1x1_planes, expand3x3_planes): - super(Fire, self).__init__() - self.inplanes = inplanes - self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) - self.squeeze_activation = nn.ReLU(inplace=True) - self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, - kernel_size=1) - self.expand1x1_activation = nn.ReLU(inplace=True) - self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, - kernel_size=3, padding=1) - self.expand3x3_activation = nn.ReLU(inplace=True) - - def forward(self, x): - x = self.squeeze_activation(self.squeeze(x)) - return torch.cat([ - self.expand1x1_activation(self.expand1x1(x)), - self.expand3x3_activation(self.expand3x3(x)) - ], 1) - - -class SqueezeNet(nn.Module): - - def __init__(self, version=1.0, num_classes=1000): - super(SqueezeNet, self).__init__() - if version not in [1.0, 1.1]: - raise ValueError("Unsupported SqueezeNet version {version}:" - "1.0 or 1.1 expected".format(version=version)) - self.num_classes = num_classes - if version == 1.0: - self.features = nn.Sequential( - nn.Conv2d(3, 96, kernel_size=7, stride=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(96, 16, 64, 64), - Fire(128, 16, 64, 64), - Fire(128, 32, 128, 128), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(256, 32, 128, 128), - Fire(256, 48, 192, 192), - Fire(384, 48, 192, 192), - Fire(384, 64, 256, 256), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(512, 64, 256, 256), - ) - else: - self.features = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=3, stride=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(64, 16, 64, 64), - Fire(128, 16, 64, 64), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(128, 32, 128, 128), - Fire(256, 32, 128, 128), - nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), - Fire(256, 48, 192, 192), - Fire(384, 48, 192, 192), - Fire(384, 64, 256, 256), - Fire(512, 64, 256, 256), - ) - # Final convolution is initialized differently form the rest - final_conv = nn.Conv2d(512, num_classes, kernel_size=1) - self.classifier = nn.Sequential( - nn.Dropout(p=0.5), - final_conv, - nn.ReLU(inplace=True), - nn.AvgPool2d(13) - ) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - if m is final_conv: - init.normal(m.weight.data, mean=0.0, std=0.01) - else: - init.kaiming_uniform(m.weight.data) - if m.bias is not None: - m.bias.data.zero_() - - def forward(self, x): - x = self.features(x) - x = self.classifier(x) - return x.view(x.size(0), self.num_classes) - - -def squeezenet1_0(pretrained=False, **kwargs): - r"""SqueezeNet model architecture from the `"SqueezeNet: AlexNet-level - accuracy with 50x fewer parameters and <0.5MB model size" - `_ paper. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = SqueezeNet(version=1.0, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['squeezenet1_0'])) - return model - - -def squeezenet1_1(pretrained=False, **kwargs): - r"""SqueezeNet 1.1 model from the `official SqueezeNet repo - `_. - SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters - than SqueezeNet 1.0, without sacrificing accuracy. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - model = SqueezeNet(version=1.1, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['squeezenet1_1'])) - return model diff --git a/utils/__init__.py b/utils/__init__.py index 8fb896c..848436b 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -2,4 +2,10 @@ """ from .misc import * from .logger import * -from .visualize import * \ No newline at end of file +from .visualize import * +from .eval import * + +# progress bar +import os, sys +sys.path.append(os.path.join(os.path.dirname(__file__), "progress")) +from progress.bar import Bar as Bar \ No newline at end of file diff --git a/utils/eval.py b/utils/eval.py new file mode 100644 index 0000000..5051350 --- /dev/null +++ b/utils/eval.py @@ -0,0 +1,18 @@ +from __future__ import print_function, absolute_import + +__all__ = ['accuracy'] + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res \ No newline at end of file diff --git a/utils/misc.py b/utils/misc.py index 6c54dd9..324309c 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -13,8 +13,7 @@ import torch.nn.init as init from torch.autograd import Variable -__all__ = ['get_mean_and_std', 'init_params', 'progress_bar', 'format_time', - 'mkdir_p', 'upsampling'] +__all__ = ['get_mean_and_std', 'init_params', 'mkdir_p', 'AverageMeter'] def get_mean_and_std(dataset): @@ -47,88 +46,6 @@ def init_params(net): if m.bias: init.constant(m.bias, 0) - -_, term_width = os.popen('stty size', 'r').read().split() -term_width = int(term_width) - -TOTAL_BAR_LENGTH = 86. -last_time = time.time() -begin_time = last_time -def progress_bar(current, total, msg=None): - global last_time, begin_time - if current == 0: - begin_time = time.time() # Reset for new bar. - - cur_len = int(TOTAL_BAR_LENGTH*current/total) - rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1 - - sys.stdout.write(' [') - for i in range(cur_len): - sys.stdout.write('=') - sys.stdout.write('>') - for i in range(rest_len): - sys.stdout.write('.') - sys.stdout.write(']') - - cur_time = time.time() - step_time = cur_time - last_time - last_time = cur_time - tot_time = cur_time - begin_time - - L = [] - L.append(' Step: %s' % format_time(step_time)) - L.append(' | Tot: %s' % format_time(tot_time)) - if msg: - L.append(' | ' + msg) - - msg = ''.join(L) - sys.stdout.write(msg) - for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3): - sys.stdout.write(' ') - - # Go back to the center of the bar. 
- for i in range(term_width-int(TOTAL_BAR_LENGTH/2)): - sys.stdout.write('\b') - sys.stdout.write(' %d/%d ' % (current+1, total)) - - if current < total-1: - sys.stdout.write('\r') - else: - sys.stdout.write('\n') - sys.stdout.flush() - -def format_time(seconds): - days = int(seconds / 3600/24) - seconds = seconds - days*3600*24 - hours = int(seconds / 3600) - seconds = seconds - hours*3600 - minutes = int(seconds / 60) - seconds = seconds - minutes*60 - secondsf = int(seconds) - seconds = seconds - secondsf - millis = int(seconds*1000) - - f = '' - i = 1 - if days > 0: - f += str(days) + 'D' - i += 1 - if hours > 0 and i <= 2: - f += str(hours) + 'h' - i += 1 - if minutes > 0 and i <= 2: - f += str(minutes) + 'm' - i += 1 - if secondsf > 0 and i <= 2: - f += str(secondsf) + 's' - i += 1 - if millis > 0 and i <= 2: - f += str(millis) + 'ms' - i += 1 - if f == '': - f = '0ms' - return f - def mkdir_p(path): '''make dir if not exist''' try: @@ -139,7 +56,19 @@ def mkdir_p(path): else: raise -def upsampling(x, scale_factor=1): - model = nn.UpsamplingBilinear2d(scale_factor=scale_factor) - output = model(Variable(x)) - return output.data \ No newline at end of file +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count \ No newline at end of file diff --git a/utils/progress b/utils/progress new file mode 160000 index 0000000..715a2e1 --- /dev/null +++ b/utils/progress @@ -0,0 +1 @@ +Subproject commit 715a2e130f14fa95c092c771813fb1c729dae333
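
The new `utils` exports and the `resnext` constructor are exactly what `cifar.py` now imports. As a quick sanity check, here is a minimal sketch (not part of the patch itself, written against the 2017-era PyTorch API the repository targets, i.e. `Variable` and `loss.data[0]`) that instantiates the 8x64d configuration on a dummy batch and runs the result through `accuracy`, `AverageMeter`, and the `progress` `Bar`. It assumes you run from the repository root with the `progress` submodule initialized (`git submodule update --init`); the batch contents are illustrative only.

```python
import torch
from torch.autograd import Variable

from models.resnext import resnext
from utils import AverageMeter, Bar, accuracy

# ResNeXt-29, 8x64d: the configuration logged under checkpoints/resnext-8x64d.
model = resnext(cardinality=8, depth=29, num_classes=10, widen_factor=4)
criterion = torch.nn.CrossEntropyLoss()

losses, top1 = AverageMeter(), AverageMeter()
bar = Bar('Processing', max=1)

inputs = Variable(torch.randn(4, 3, 32, 32))        # dummy CIFAR-sized RGB batch
targets = Variable(torch.LongTensor([0, 1, 2, 3]))  # dummy labels

outputs = model(inputs)
loss = criterion(outputs, targets)
prec1 = accuracy(outputs.data, targets.data, topk=(1,))[0]

losses.update(loss.data[0], inputs.size(0))  # running average of the loss
top1.update(prec1[0], inputs.size(0))        # running top-1 accuracy (percent)
bar.suffix = 'Loss: {loss:.4f} | top1: {top1:.2f}'.format(loss=losses.avg, top1=top1.avg)
bar.next()
bar.finish()
```

Given the imports in `cifar.py`, its training and validation loops are expected to follow this same pattern: update the meters per batch and write a formatted `bar.suffix` before calling `bar.next()`.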