From ce3ada052e88e459eb2bdf3af3f4831cf85a29f3 Mon Sep 17 00:00:00 2001 From: hanqi Date: Tue, 15 Mar 2022 04:44:07 +0000 Subject: [PATCH] iclr2022 --- config.py | 20 +- ...dynamic_dwnet_base_patch4_window7_224.yaml | 12 + ...dynamic_dwnet_tiny_patch4_window7_224.yaml | 12 + models/build.py | 17 ++ models/dwnet.py | 33 ++- models/idynamic.py | 274 ++++++++++++++++++ scripts/run_dwnet_base_patch4_window7_224.sh | 3 +- ...n_dynamic_dwnet_base_patch4_window7_224.sh | 3 +- ...i_dynamic_dwnet_base_patch4_window7_224.sh | 24 ++ ...i_dynamic_dwnet_tiny_patch4_window7_224.sh | 24 ++ 10 files changed, 393 insertions(+), 29 deletions(-) create mode 100644 configs/i_dynamic_dwnet_base_patch4_window7_224.yaml create mode 100644 configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml create mode 100755 models/idynamic.py create mode 100644 scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh create mode 100644 scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh diff --git a/config.py b/config.py index d28851c..78c32e5 100644 --- a/config.py +++ b/config.py @@ -50,7 +50,7 @@ # Label Smoothing _C.MODEL.LABEL_SMOOTHING = 0.1 -# Swin Transformer parameters +# DWNet parameters _C.MODEL.DWNET = CN() _C.MODEL.DWNET.PATCH_SIZE = 4 _C.MODEL.DWNET.IN_CHANS = 3 @@ -62,22 +62,8 @@ _C.MODEL.DWNET.PATCH_NORM = True _C.MODEL.DWNET.CONV_TYPE = "v1" _C.MODEL.DWNET.DYNAMIC = False - -# halo Transformer parameters -_C.MODEL.HALO = CN() -_C.MODEL.HALO.PATCH_SIZE = 4 -_C.MODEL.HALO.IN_CHANS = 3 -_C.MODEL.HALO.EMBED_DIM = 96 -_C.MODEL.HALO.DEPTHS = [2, 2, 6, 2] -_C.MODEL.HALO.NUM_HEADS = [3, 6, 12, 24] -_C.MODEL.HALO.WINDOW_SIZE = [7, 7, 7, 7] -_C.MODEL.HALO.HALO_SIZE = [3, 3, 3, 3] -_C.MODEL.HALO.MLP_RATIO = 4. -_C.MODEL.HALO.QKV_BIAS = True -_C.MODEL.HALO.QK_SCALE = None -_C.MODEL.HALO.APE = False -_C.MODEL.HALO.PATCH_NORM = True - +_C.MODEL.DWNET.INHOMO = False +_C.MODEL.DWNET.INHOMO_HEADS = [ 4, 8, 16, 32 ] # ----------------------------------------------------------------------------- # Training settings diff --git a/configs/i_dynamic_dwnet_base_patch4_window7_224.yaml b/configs/i_dynamic_dwnet_base_patch4_window7_224.yaml new file mode 100644 index 0000000..8c8fdca --- /dev/null +++ b/configs/i_dynamic_dwnet_base_patch4_window7_224.yaml @@ -0,0 +1,12 @@ +MODEL: + TYPE: iddwnet + NAME: iddwnet_base_patch4_window7_224 + DROP_PATH_RATE: 0.5 + DWNET: + EMBED_DIM: 128 + DEPTHS: [ 2, 2, 18, 2 ] + WINDOW_SIZE: 7 + DYNAMIC: True + INHOMO: True + INHOMO_HEADS: [ 4, 8, 16, 32 ] +AMP_OPT_LEVEL: "O0" diff --git a/configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml b/configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml new file mode 100644 index 0000000..12e0fb5 --- /dev/null +++ b/configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml @@ -0,0 +1,12 @@ +MODEL: + TYPE: iddwnet + NAME: iddwnet_tiny_patch4_window7_224 + DROP_PATH_RATE: 0.2 + DWNET: + EMBED_DIM: 96 + DEPTHS: [ 2, 2, 6, 2 ] + WINDOW_SIZE: 7 + DYNAMIC: True + INHOMO: True + INHOMO_HEADS: [ 3, 6, 12, 24 ] +AMP_OPT_LEVEL: "O0" diff --git a/models/build.py b/models/build.py index d7942ab..06dad2c 100644 --- a/models/build.py +++ b/models/build.py @@ -32,6 +32,23 @@ def build_model(config): patch_norm=config.MODEL.DWNET.PATCH_NORM, use_checkpoint=config.TRAIN.USE_CHECKPOINT, dynamic=config.MODEL.DWNET.DYNAMIC) + elif model_type == 'iddwnet': + model = DWNet(img_size=config.DATA.IMG_SIZE, + patch_size=config.MODEL.DWNET.PATCH_SIZE, + in_chans=config.MODEL.DWNET.IN_CHANS, + num_classes=config.MODEL.NUM_CLASSES, + embed_dim=config.MODEL.DWNET.EMBED_DIM, + 
depths=config.MODEL.DWNET.DEPTHS, + window_size=config.MODEL.DWNET.WINDOW_SIZE, + mlp_ratio=config.MODEL.DWNET.MLP_RATIO, + drop_rate=config.MODEL.DROP_RATE, + drop_path_rate=config.MODEL.DROP_PATH_RATE, + ape=config.MODEL.DWNET.APE, + patch_norm=config.MODEL.DWNET.PATCH_NORM, + use_checkpoint=config.TRAIN.USE_CHECKPOINT, + dynamic=config.MODEL.DWNET.DYNAMIC, + inhomogeneous=config.MODEL.DWNET.INHOMO, + inhomo_heads=config.MODEL.DWNET.INHOMO_HEADS) else: raise NotImplementedError(f"Unkown model: {model_type}") diff --git a/models/dwnet.py b/models/dwnet.py index dedb438..4179d1d 100644 --- a/models/dwnet.py +++ b/models/dwnet.py @@ -3,6 +3,7 @@ import torch.utils.checkpoint as checkpoint import torch.nn.functional as F from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from .idynamic import IDynamicDWConv class Mlp(nn.Module): @@ -55,18 +56,23 @@ def forward(self, x): class DWBlock(nn.Module): - def __init__(self, dim, window_size, dynamic=False): + def __init__(self, dim, window_size, dynamic=False, inhomogeneous=False, heads=None): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.dynamic = dynamic + self.inhomogeneous = inhomogeneous + self.heads = heads # pw-linear self.conv0 = nn.Conv2d(dim, dim, 1, bias=False) self.bn0 = nn.BatchNorm2d(dim) - if dynamic: + if dynamic and not inhomogeneous: self.conv = DynamicDWConv(dim, kernel_size=window_size, stride=1, padding=window_size // 2, groups=dim) + if dynamic and inhomogeneous: + print(window_size, heads) + self.conv = IDynamicDWConv(dim, window_size, heads) else : self.conv = nn.Conv2d(dim, dim, kernel_size=window_size, stride=1, padding=window_size // 2, groups=dim) @@ -80,7 +86,7 @@ def __init__(self, dim, window_size, dynamic=False): def forward(self, x): B, H, W, C = x.shape - x = x.permute(0, 3, 1, 2) + x = x.permute(0, 3, 1, 2).contiguous() x = self.conv0(x) x = self.bn0(x) x = self.relu(x) @@ -92,7 +98,7 @@ def forward(self, x): x = self.conv2(x) x=self.bn2(x) - x = x.permute(0, 2, 3, 1) + x = x.permute(0, 2, 3, 1).contiguous() return x def extra_repr(self) -> str: @@ -104,8 +110,10 @@ def flops(self, N): # x = self.conv0(x) flops += N * self.dim * self.dim # x = self.conv(x) - if self.dynamic: + if self.dynamic and not self.inhomogeneous: flops += (N * self.dim + self.dim * self.dim / 4 + self.dim / 4 * self.dim * self.window_size * self.window_size) + elif self.dynamic and self.inhomogeneous: + flops += (N * self.dim * self.dim / 4 + N * self.dim / 4 * self.dim / self.heads * self.window_size * self.window_size) flops += N * self.dim * self.window_size * self.window_size # x = self.conv2(x) flops += N * self.dim * self.dim @@ -117,7 +125,7 @@ def flops(self, N): class SpatialBlock(nn.Module): def __init__(self, dim, input_resolution, window_size=7, - mlp_ratio=4., drop=0., drop_path=0., dynamic=False, act_layer=nn.GELU): + mlp_ratio=4., drop=0., drop_path=0., dynamic=False, inhomogeneous=False, inhomo_head=None, act_layer=nn.GELU): super().__init__() self.dim = dim self.input_resolution = input_resolution @@ -125,7 +133,7 @@ def __init__(self, dim, input_resolution, window_size=7, self.mlp_ratio = mlp_ratio self.dynamic = dynamic - self.attn2conv = DWBlock(dim, window_size, dynamic) + self.attn2conv = DWBlock(dim, window_size, dynamic, inhomogeneous, inhomo_head) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() @@ -211,7 +219,7 @@ class BasicLayer(nn.Module): def __init__(self, dim, input_resolution, depth, window_size, mlp_ratio=4., drop=0., drop_path=0., norm_layer=nn.LayerNorm, - downsample=None, use_checkpoint=False, dynamic=False): + downsample=None, use_checkpoint=False, dynamic=False, inhomogeneous=False, inhomo_head=None): super().__init__() self.dim = dim @@ -226,6 +234,8 @@ def __init__(self, dim, input_resolution, depth, window_size, mlp_ratio=mlp_ratio, drop=drop, dynamic=dynamic, + inhomogeneous=inhomogeneous, + inhomo_head=inhomo_head, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path) for i in range(depth)]) @@ -301,7 +311,7 @@ class DWNet(nn.Module): def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, embed_dim=96, depths=[2, 2, 6, 2], window_size=7, mlp_ratio=4., drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm, - ape=False, patch_norm=True, use_checkpoint=False, dynamic=False, **kwargs): + ape=False, patch_norm=True, use_checkpoint=False, dynamic=False, inhomogeneous=False, inhomo_heads=None, **kwargs): super().__init__() self.num_classes = num_classes @@ -344,7 +354,10 @@ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint, - dynamic=dynamic) + dynamic=dynamic, + inhomogeneous=inhomogeneous, + inhomo_head=inhomo_heads[i_layer]) + self.layers.append(layer) self.norm = norm_layer(self.num_features) diff --git a/models/idynamic.py b/models/idynamic.py new file mode 100755 index 0000000..1b4cc60 --- /dev/null +++ b/models/idynamic.py @@ -0,0 +1,274 @@ +from torch.autograd import Function +import torch +from torch.nn.modules.utils import _pair +import torch.nn.functional as F +import torch.nn as nn +from mmcv.cnn import ConvModule + + +from collections import namedtuple +import cupy +from string import Template + + +Stream = namedtuple('Stream', ['ptr']) + + +def Dtype(t): + if isinstance(t, torch.cuda.FloatTensor): + return 'float' + elif isinstance(t, torch.cuda.DoubleTensor): + return 'double' + + +@cupy._util.memoize(for_each_device=True) +def load_kernel(kernel_name, code, **kwargs): + code = Template(code).substitute(**kwargs) + kernel_code = cupy.cuda.compile_with_cache(code) + return kernel_code.get_function(kernel_name) + + +CUDA_NUM_THREADS = 1024 + +kernel_loop = ''' +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) +''' + + +def GET_BLOCKS(N): + return (N + CUDA_NUM_THREADS - 1) // CUDA_NUM_THREADS + + +_idynamic_kernel = kernel_loop + ''' +extern "C" +__global__ void idynamic_forward_kernel( +const ${Dtype}* bottom_data, const ${Dtype}* weight_data, ${Dtype}* top_data) { + CUDA_KERNEL_LOOP(index, ${nthreads}) { + const int n = index / ${channels} / ${top_height} / ${top_width}; + const int c = (index / ${top_height} / ${top_width}) % ${channels}; + const int h = (index / ${top_width}) % ${top_height}; + const int w = index % ${top_width}; + const int g = c / (${channels} / ${groups}); + ${Dtype} value = 0; + #pragma unroll + for (int kh = 0; kh < ${kernel_h}; ++kh) { + #pragma unroll + for (int kw = 0; kw < ${kernel_w}; ++kw) { + const int h_in = -${pad_h} + h * ${stride_h} + kh * ${dilation_h}; + const int w_in = -${pad_w} + w * ${stride_w} + kw * ${dilation_w}; + if ((h_in >= 0) && (h_in < ${bottom_height}) + && (w_in >= 0) && (w_in < ${bottom_width})) { + const int 
offset = ((n * ${channels} + c) * ${bottom_height} + h_in) + * ${bottom_width} + w_in; + const int offset_weight = ((((n * ${groups} + g) * ${kernel_h} + kh) * ${kernel_w} + kw) * ${top_height} + h) + * ${top_width} + w; + value += weight_data[offset_weight] * bottom_data[offset]; + } + } + } + top_data[index] = value; + } +} +''' + + +_idynamic_kernel_backward_grad_input = kernel_loop + ''' +extern "C" +__global__ void idynamic_backward_grad_input_kernel( + const ${Dtype}* const top_diff, const ${Dtype}* const weight_data, ${Dtype}* const bottom_diff) { + CUDA_KERNEL_LOOP(index, ${nthreads}) { + const int n = index / ${channels} / ${bottom_height} / ${bottom_width}; + const int c = (index / ${bottom_height} / ${bottom_width}) % ${channels}; + const int h = (index / ${bottom_width}) % ${bottom_height}; + const int w = index % ${bottom_width}; + const int g = c / (${channels} / ${groups}); + ${Dtype} value = 0; + #pragma unroll + for (int kh = 0; kh < ${kernel_h}; ++kh) { + #pragma unroll + for (int kw = 0; kw < ${kernel_w}; ++kw) { + const int h_out_s = h + ${pad_h} - kh * ${dilation_h}; + const int w_out_s = w + ${pad_w} - kw * ${dilation_w}; + if (((h_out_s % ${stride_h}) == 0) && ((w_out_s % ${stride_w}) == 0)) { + const int h_out = h_out_s / ${stride_h}; + const int w_out = w_out_s / ${stride_w}; + if ((h_out >= 0) && (h_out < ${top_height}) + && (w_out >= 0) && (w_out < ${top_width})) { + const int offset = ((n * ${channels} + c) * ${top_height} + h_out) + * ${top_width} + w_out; + const int offset_weight = ((((n * ${groups} + g) * ${kernel_h} + kh) * ${kernel_w} + kw) * ${top_height} + h_out) + * ${top_width} + w_out; + value += weight_data[offset_weight] * top_diff[offset]; + } + } + } + } + bottom_diff[index] = value; + } +} +''' + + +_idynamic_kernel_backward_grad_weight = kernel_loop + ''' +extern "C" +__global__ void idynamic_backward_grad_weight_kernel( + const ${Dtype}* const top_diff, const ${Dtype}* const bottom_data, ${Dtype}* const buffer_data) { + CUDA_KERNEL_LOOP(index, ${nthreads}) { + const int h = (index / ${top_width}) % ${top_height}; + const int w = index % ${top_width}; + const int kh = (index / ${kernel_w} / ${top_height} / ${top_width}) + % ${kernel_h}; + const int kw = (index / ${top_height} / ${top_width}) % ${kernel_w}; + const int h_in = -${pad_h} + h * ${stride_h} + kh * ${dilation_h}; + const int w_in = -${pad_w} + w * ${stride_w} + kw * ${dilation_w}; + if ((h_in >= 0) && (h_in < ${bottom_height}) + && (w_in >= 0) && (w_in < ${bottom_width})) { + const int g = (index / ${kernel_h} / ${kernel_w} / ${top_height} / ${top_width}) % ${groups}; + const int n = (index / ${groups} / ${kernel_h} / ${kernel_w} / ${top_height} / ${top_width}) % ${num}; + ${Dtype} value = 0; + #pragma unroll + for (int c = g * (${channels} / ${groups}); c < (g + 1) * (${channels} / ${groups}); ++c) { + const int top_offset = ((n * ${channels} + c) * ${top_height} + h) + * ${top_width} + w; + const int bottom_offset = ((n * ${channels} + c) * ${bottom_height} + h_in) + * ${bottom_width} + w_in; + value += top_diff[top_offset] * bottom_data[bottom_offset]; + } + buffer_data[index] = value; + } else { + buffer_data[index] = 0; + } + } +} +''' + + +class _idynamic(Function): + @staticmethod + def forward(ctx, input, weight, stride, padding, dilation): + assert input.dim() == 4 and input.is_cuda + assert weight.dim() == 6 and weight.is_cuda + batch_size, channels, height, width = input.size() + kernel_h, kernel_w = weight.size()[2:4] + output_h = int((height + 2 * padding[0] - 
(dilation[0] * (kernel_h - 1) + 1)) / stride[0] + 1) + output_w = int((width + 2 * padding[1] - (dilation[1] * (kernel_w - 1) + 1)) / stride[1] + 1) + + output = input.new(batch_size, channels, output_h, output_w) + n = output.numel() + + with torch.cuda.device_of(input): + f = load_kernel('idynamic_forward_kernel', _idynamic_kernel, Dtype=Dtype(input), nthreads=n, + num=batch_size, channels=channels, groups=weight.size()[1], + bottom_height=height, bottom_width=width, + top_height=output_h, top_width=output_w, + kernel_h=kernel_h, kernel_w=kernel_w, + stride_h=stride[0], stride_w=stride[1], + dilation_h=dilation[0], dilation_w=dilation[1], + pad_h=padding[0], pad_w=padding[1]) + f(block=(CUDA_NUM_THREADS,1,1), + grid=(GET_BLOCKS(n),1,1), + args=[input.data_ptr(), weight.data_ptr(), output.data_ptr()], + stream=Stream(ptr=torch.cuda.current_stream().cuda_stream)) + + ctx.save_for_backward(input, weight) + ctx.stride, ctx.padding, ctx.dilation = stride, padding, dilation + return output + + @staticmethod + def backward(ctx, grad_output): + assert grad_output.is_cuda + if not grad_output.is_contiguous(): + grad_output.contiguous() + input, weight = ctx.saved_tensors + stride, padding, dilation = ctx.stride, ctx.padding, ctx.dilation + + batch_size, channels, height, width = input.size() + kernel_h, kernel_w = weight.size()[2:4] + output_h, output_w = grad_output.size()[2:] + + grad_input, grad_weight = None, None + + opt = dict(Dtype=Dtype(grad_output), + num=batch_size, channels=channels, groups=weight.size()[1], + bottom_height=height, bottom_width=width, + top_height=output_h, top_width=output_w, + kernel_h=kernel_h, kernel_w=kernel_w, + stride_h=stride[0], stride_w=stride[1], + dilation_h=dilation[0], dilation_w=dilation[1], + pad_h=padding[0], pad_w=padding[1]) + + with torch.cuda.device_of(input): + if ctx.needs_input_grad[0]: + grad_input = input.new(input.size()) + + n = grad_input.numel() + opt['nthreads'] = n + + f = load_kernel('idynamic_backward_grad_input_kernel', + _idynamic_kernel_backward_grad_input, **opt) + f(block=(CUDA_NUM_THREADS,1,1), + grid=(GET_BLOCKS(n),1,1), + args=[grad_output.data_ptr(), weight.data_ptr(), grad_input.data_ptr()], + stream=Stream(ptr=torch.cuda.current_stream().cuda_stream)) + + if ctx.needs_input_grad[1]: + grad_weight = weight.new(weight.size()) + + n = grad_weight.numel() + opt['nthreads'] = n + + f = load_kernel('idynamic_backward_grad_weight_kernel', + _idynamic_kernel_backward_grad_weight, **opt) + f(block=(CUDA_NUM_THREADS,1,1), + grid=(GET_BLOCKS(n),1,1), + args=[grad_output.data_ptr(), input.data_ptr(), grad_weight.data_ptr()], + stream=Stream(ptr=torch.cuda.current_stream().cuda_stream)) + + return grad_input, grad_weight, None, None, None + + +def _idynamic_cuda(input, weight, bias=None, stride=1, padding=0, dilation=1): + """ idynamic kernel + """ + assert input.size(0) == weight.size(0) + assert input.size(-2)//stride == weight.size(-2) + assert input.size(-1)//stride == weight.size(-1) + if input.is_cuda: + out = _idynamic.apply(input, weight, _pair(stride), _pair(padding), _pair(dilation)) + if bias is not None: + out += bias.view(1,-1,1,1) + else: + raise NotImplementedError + return out + + +class IDynamicDWConv(nn.Module): + + def __init__(self, + channels, + kernel_size, + group_channels): + super(IDynamicDWConv, self).__init__() + self.kernel_size = kernel_size + self.channels = channels + reduction_ratio = 4 + self.group_channels = group_channels + self.groups = self.channels // self.group_channels + self.conv1 = nn.Sequential( 
+            nn.Conv2d(channels, channels // reduction_ratio, 1),
+            nn.BatchNorm2d(channels // reduction_ratio),
+            nn.ReLU()
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(channels // reduction_ratio, kernel_size**2 * self.groups, 1)
+        )
+
+    def forward(self, x):
+        weight = self.conv2(self.conv1(x))
+        b, c, h, w = weight.shape
+        weight = weight.view(b, self.groups, self.kernel_size, self.kernel_size, h, w)
+        out = _idynamic_cuda(x, weight, stride=1, padding=(self.kernel_size-1)//2)
+        return out
diff --git a/scripts/run_dwnet_base_patch4_window7_224.sh b/scripts/run_dwnet_base_patch4_window7_224.sh
index 905c850..4cadeaf 100644
--- a/scripts/run_dwnet_base_patch4_window7_224.sh
+++ b/scripts/run_dwnet_base_patch4_window7_224.sh
@@ -20,4 +20,5 @@ MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
     --output "output/dwnet_base_patch4_window7_224" \
     --data-set IMNET \
     --batch-size 64 \
-    --amp-opt-level O0
\ No newline at end of file
+    --accumulation-steps 2 \
+    --amp-opt-level O1
diff --git a/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh b/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh
index 871a954..241f836 100644
--- a/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh
+++ b/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh
@@ -20,4 +20,5 @@ MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
     --output "output/dynamic_dwnet_base_patch4_window7_224" \
     --data-set IMNET \
     --batch-size 64 \
-    --amp-opt-level O0
\ No newline at end of file
+    --accumulation-steps 2 \
+    --amp-opt-level O1
diff --git a/scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh b/scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh
new file mode 100644
index 0000000..3c2c7e4
--- /dev/null
+++ b/scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+NCCL_SOCKET_IFNAME=ib0
+
+MASTER_IP=${MASTER_IP}
+MASTER_PORT=12345
+NODE_RANK=${OMPI_COMM_WORLD_RANK} && echo NODE_RANK: ${NODE_RANK}
+PER_NODE_GPU=8 && echo PER_NODE_GPU: ${PER_NODE_GPU}
+NUM_NODE=${OMPI_COMM_WORLD_SIZE} && echo NUM_NODE: ${NUM_NODE}
+
+MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
+    --nproc_per_node 8 \
+    --nnodes=2 \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_IP \
+    --master_port=$MASTER_PORT \
+    main.py \
+    --cfg ./configs/i_dynamic_dwnet_base_patch4_window7_224.yaml \
+    --data-path "/path/to/imagenet" \
+    --output "output/i_dynamic_dwnet_base_patch4_window7_224" \
+    --data-set IMNET \
+    --batch-size 64 \
+    --accumulation-steps 2 \
+    --amp-opt-level O0
diff --git a/scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh b/scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh
new file mode 100644
index 0000000..aa066c4
--- /dev/null
+++ b/scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+NCCL_SOCKET_IFNAME=ib0
+
+MASTER_IP=${MASTER_IP}
+MASTER_PORT=12345
+NODE_RANK=${OMPI_COMM_WORLD_RANK} && echo NODE_RANK: ${NODE_RANK}
+PER_NODE_GPU=8 && echo PER_NODE_GPU: ${PER_NODE_GPU}
+NUM_NODE=${OMPI_COMM_WORLD_SIZE} && echo NUM_NODE: ${NUM_NODE}
+
+MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
+    --nproc_per_node 8 \
+    --nnodes=1 \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_IP \
+    --master_port=$MASTER_PORT \
+    main.py \
+    --cfg ./configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml \
+    --data-path "/path/to/imagenet" \
+    --output "output/i_dynamic_dwnet_tiny_patch4_window7_224" \
+    --data-set IMNET \
+    --batch-size 64 \
+    --accumulation-steps 2 \
+    --amp-opt-level O0
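
Reference sketch (not part of the patch): the CUDA path above (idynamic_forward_kernel used through IDynamicDWConv) applies a separate kernel_size x kernel_size filter at every output position, shared by the group_channels channels of each group. For checking the kernel on CPU, or where cupy is unavailable, an equivalent computation for this layout can be sketched in plain PyTorch with F.unfold. The names below (idynamic_dwconv_reference, IDynamicDWConvRef) are illustrative only; the weight generator mirrors the conv1/conv2 branch of IDynamicDWConv, and the sizes in the usage example loosely follow the tiny config (EMBED_DIM 96, WINDOW_SIZE 7, INHOMO_HEADS 3).

import torch
import torch.nn as nn
import torch.nn.functional as F


def idynamic_dwconv_reference(x, weight, kernel_size, stride=1, padding=0, dilation=1):
    """Pure-PyTorch sketch of the idynamic forward kernel (hypothetical helper).

    x:      (B, C, H, W) input feature map
    weight: (B, groups, K, K, H_out, W_out), one K x K filter per group and per output position
    """
    B, C, H, W = x.shape
    groups, K = weight.shape[1], kernel_size
    h_out = (H + 2 * padding - dilation * (K - 1) - 1) // stride + 1
    w_out = (W + 2 * padding - dilation * (K - 1) - 1) // stride + 1
    # (B, C*K*K, h_out*w_out): each column is the K*K neighbourhood of one output position
    patches = F.unfold(x, K, dilation=dilation, padding=padding, stride=stride)
    patches = patches.view(B, groups, C // groups, K * K, h_out, w_out)
    w = weight.reshape(B, groups, 1, K * K, h_out, w_out)
    # weight each neighbourhood with its position-specific filter, then sum over kernel positions
    return (patches * w).sum(dim=3).view(B, C, h_out, w_out)


class IDynamicDWConvRef(nn.Module):
    """CPU-friendly stand-in for IDynamicDWConv: same weight generator, unfold-based aggregation."""

    def __init__(self, channels, kernel_size, group_channels, reduction_ratio=4):
        super().__init__()
        self.kernel_size = kernel_size
        self.groups = channels // group_channels
        self.conv1 = nn.Sequential(
            nn.Conv2d(channels, channels // reduction_ratio, 1),
            nn.BatchNorm2d(channels // reduction_ratio),
            nn.ReLU(),
        )
        self.conv2 = nn.Conv2d(channels // reduction_ratio, kernel_size ** 2 * self.groups, 1)

    def forward(self, x):
        b, _, h, w = x.shape
        # per-position, per-group kernels predicted from the input itself
        weight = self.conv2(self.conv1(x))
        weight = weight.view(b, self.groups, self.kernel_size, self.kernel_size, h, w)
        return idynamic_dwconv_reference(x, weight, self.kernel_size,
                                         padding=(self.kernel_size - 1) // 2)


if __name__ == "__main__":
    # illustrative sizes, loosely matching the first stage of the tiny config
    m = IDynamicDWConvRef(channels=96, kernel_size=7, group_channels=3)
    y = m(torch.randn(2, 96, 14, 14))
    print(y.shape)  # torch.Size([2, 96, 14, 14])

The unfold-based version materializes B x C x K x K x H x W intermediate activations, which is presumably why the patch ships a fused CUDA kernel; the sketch is intended only for correctness checks on small tensors.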