From ce3ada052e88e459eb2bdf3af3f4831cf85a29f3 Mon Sep 17 00:00:00 2001 From: hanqi Date: Tue, 15 Mar 2022 04:44:07 +0000 Subject: [PATCH] iclr2022 --- config.py | 20 +- ...dynamic_dwnet_base_patch4_window7_224.yaml | 12 + ...dynamic_dwnet_tiny_patch4_window7_224.yaml | 12 + models/build.py | 17 ++ models/dwnet.py | 33 ++- models/idynamic.py | 274 ++++++++++++++++++ scripts/run_dwnet_base_patch4_window7_224.sh | 3 +- ...n_dynamic_dwnet_base_patch4_window7_224.sh | 3 +- ...i_dynamic_dwnet_base_patch4_window7_224.sh | 24 ++ ...i_dynamic_dwnet_tiny_patch4_window7_224.sh | 24 ++ 10 files changed, 393 insertions(+), 29 deletions(-) create mode 100644 configs/i_dynamic_dwnet_base_patch4_window7_224.yaml create mode 100644 configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml create mode 100755 models/idynamic.py create mode 100644 scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh create mode 100644 scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh diff --git a/config.py b/config.py index d28851c..78c32e5 100644 --- a/config.py +++ b/config.py @@ -50,7 +50,7 @@ # Label Smoothing _C.MODEL.LABEL_SMOOTHING = 0.1 -# Swin Transformer parameters +# DWNet parameters _C.MODEL.DWNET = CN() _C.MODEL.DWNET.PATCH_SIZE = 4 _C.MODEL.DWNET.IN_CHANS = 3 @@ -62,22 +62,8 @@ _C.MODEL.DWNET.PATCH_NORM = True _C.MODEL.DWNET.CONV_TYPE = "v1" _C.MODEL.DWNET.DYNAMIC = False - -# halo Transformer parameters -_C.MODEL.HALO = CN() -_C.MODEL.HALO.PATCH_SIZE = 4 -_C.MODEL.HALO.IN_CHANS = 3 -_C.MODEL.HALO.EMBED_DIM = 96 -_C.MODEL.HALO.DEPTHS = [2, 2, 6, 2] -_C.MODEL.HALO.NUM_HEADS = [3, 6, 12, 24] -_C.MODEL.HALO.WINDOW_SIZE = [7, 7, 7, 7] -_C.MODEL.HALO.HALO_SIZE = [3, 3, 3, 3] -_C.MODEL.HALO.MLP_RATIO = 4. -_C.MODEL.HALO.QKV_BIAS = True -_C.MODEL.HALO.QK_SCALE = None -_C.MODEL.HALO.APE = False -_C.MODEL.HALO.PATCH_NORM = True - +_C.MODEL.DWNET.INHOMO = False +_C.MODEL.DWNET.INHOMO_HEADS = [ 4, 8, 16, 32 ] # ----------------------------------------------------------------------------- # Training settings diff --git a/configs/i_dynamic_dwnet_base_patch4_window7_224.yaml b/configs/i_dynamic_dwnet_base_patch4_window7_224.yaml new file mode 100644 index 0000000..8c8fdca --- /dev/null +++ b/configs/i_dynamic_dwnet_base_patch4_window7_224.yaml @@ -0,0 +1,12 @@ +MODEL: + TYPE: iddwnet + NAME: iddwnet_base_patch4_window7_224 + DROP_PATH_RATE: 0.5 + DWNET: + EMBED_DIM: 128 + DEPTHS: [ 2, 2, 18, 2 ] + WINDOW_SIZE: 7 + DYNAMIC: True + INHOMO: True + INHOMO_HEADS: [ 4, 8, 16, 32 ] +AMP_OPT_LEVEL: "O0" diff --git a/configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml b/configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml new file mode 100644 index 0000000..12e0fb5 --- /dev/null +++ b/configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml @@ -0,0 +1,12 @@ +MODEL: + TYPE: iddwnet + NAME: iddwnet_tiny_patch4_window7_224 + DROP_PATH_RATE: 0.2 + DWNET: + EMBED_DIM: 96 + DEPTHS: [ 2, 2, 6, 2 ] + WINDOW_SIZE: 7 + DYNAMIC: True + INHOMO: True + INHOMO_HEADS: [ 3, 6, 12, 24 ] +AMP_OPT_LEVEL: "O0" diff --git a/models/build.py b/models/build.py index d7942ab..06dad2c 100644 --- a/models/build.py +++ b/models/build.py @@ -32,6 +32,23 @@ def build_model(config): patch_norm=config.MODEL.DWNET.PATCH_NORM, use_checkpoint=config.TRAIN.USE_CHECKPOINT, dynamic=config.MODEL.DWNET.DYNAMIC) + elif model_type == 'iddwnet': + model = DWNet(img_size=config.DATA.IMG_SIZE, + patch_size=config.MODEL.DWNET.PATCH_SIZE, + in_chans=config.MODEL.DWNET.IN_CHANS, + num_classes=config.MODEL.NUM_CLASSES, + embed_dim=config.MODEL.DWNET.EMBED_DIM, + 
depths=config.MODEL.DWNET.DEPTHS, + window_size=config.MODEL.DWNET.WINDOW_SIZE, + mlp_ratio=config.MODEL.DWNET.MLP_RATIO, + drop_rate=config.MODEL.DROP_RATE, + drop_path_rate=config.MODEL.DROP_PATH_RATE, + ape=config.MODEL.DWNET.APE, + patch_norm=config.MODEL.DWNET.PATCH_NORM, + use_checkpoint=config.TRAIN.USE_CHECKPOINT, + dynamic=config.MODEL.DWNET.DYNAMIC, + inhomogeneous=config.MODEL.DWNET.INHOMO, + inhomo_heads=config.MODEL.DWNET.INHOMO_HEADS) else: raise NotImplementedError(f"Unkown model: {model_type}") diff --git a/models/dwnet.py b/models/dwnet.py index dedb438..4179d1d 100644 --- a/models/dwnet.py +++ b/models/dwnet.py @@ -3,6 +3,7 @@ import torch.utils.checkpoint as checkpoint import torch.nn.functional as F from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +from .idynamic import IDynamicDWConv class Mlp(nn.Module): @@ -55,18 +56,23 @@ def forward(self, x): class DWBlock(nn.Module): - def __init__(self, dim, window_size, dynamic=False): + def __init__(self, dim, window_size, dynamic=False, inhomogeneous=False, heads=None): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.dynamic = dynamic + self.inhomogeneous = inhomogeneous + self.heads = heads # pw-linear self.conv0 = nn.Conv2d(dim, dim, 1, bias=False) self.bn0 = nn.BatchNorm2d(dim) - if dynamic: + if dynamic and not inhomogeneous: self.conv = DynamicDWConv(dim, kernel_size=window_size, stride=1, padding=window_size // 2, groups=dim) + if dynamic and inhomogeneous: + print(window_size, heads) + self.conv = IDynamicDWConv(dim, window_size, heads) else : self.conv = nn.Conv2d(dim, dim, kernel_size=window_size, stride=1, padding=window_size // 2, groups=dim) @@ -80,7 +86,7 @@ def __init__(self, dim, window_size, dynamic=False): def forward(self, x): B, H, W, C = x.shape - x = x.permute(0, 3, 1, 2) + x = x.permute(0, 3, 1, 2).contiguous() x = self.conv0(x) x = self.bn0(x) x = self.relu(x) @@ -92,7 +98,7 @@ def forward(self, x): x = self.conv2(x) x=self.bn2(x) - x = x.permute(0, 2, 3, 1) + x = x.permute(0, 2, 3, 1).contiguous() return x def extra_repr(self) -> str: @@ -104,8 +110,10 @@ def flops(self, N): # x = self.conv0(x) flops += N * self.dim * self.dim # x = self.conv(x) - if self.dynamic: + if self.dynamic and not self.inhomogeneous: flops += (N * self.dim + self.dim * self.dim / 4 + self.dim / 4 * self.dim * self.window_size * self.window_size) + elif self.dynamic and self.inhomogeneous: + flops += (N * self.dim * self.dim / 4 + N * self.dim / 4 * self.dim / self.heads * self.window_size * self.window_size) flops += N * self.dim * self.window_size * self.window_size # x = self.conv2(x) flops += N * self.dim * self.dim @@ -117,7 +125,7 @@ def flops(self, N): class SpatialBlock(nn.Module): def __init__(self, dim, input_resolution, window_size=7, - mlp_ratio=4., drop=0., drop_path=0., dynamic=False, act_layer=nn.GELU): + mlp_ratio=4., drop=0., drop_path=0., dynamic=False, inhomogeneous=False, inhomo_head=None, act_layer=nn.GELU): super().__init__() self.dim = dim self.input_resolution = input_resolution @@ -125,7 +133,7 @@ def __init__(self, dim, input_resolution, window_size=7, self.mlp_ratio = mlp_ratio self.dynamic = dynamic - self.attn2conv = DWBlock(dim, window_size, dynamic) + self.attn2conv = DWBlock(dim, window_size, dynamic, inhomogeneous, inhomo_head) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() @@ -211,7 +219,7 @@ class BasicLayer(nn.Module): def __init__(self, dim, input_resolution, depth, window_size, mlp_ratio=4., drop=0., drop_path=0., norm_layer=nn.LayerNorm, - downsample=None, use_checkpoint=False, dynamic=False): + downsample=None, use_checkpoint=False, dynamic=False, inhomogeneous=False, inhomo_head=None): super().__init__() self.dim = dim @@ -226,6 +234,8 @@ def __init__(self, dim, input_resolution, depth, window_size, mlp_ratio=mlp_ratio, drop=drop, dynamic=dynamic, + inhomogeneous=inhomogeneous, + inhomo_head=inhomo_head, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path) for i in range(depth)]) @@ -301,7 +311,7 @@ class DWNet(nn.Module): def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, embed_dim=96, depths=[2, 2, 6, 2], window_size=7, mlp_ratio=4., drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm, - ape=False, patch_norm=True, use_checkpoint=False, dynamic=False, **kwargs): + ape=False, patch_norm=True, use_checkpoint=False, dynamic=False, inhomogeneous=False, inhomo_heads=None, **kwargs): super().__init__() self.num_classes = num_classes @@ -344,7 +354,10 @@ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint, - dynamic=dynamic) + dynamic=dynamic, + inhomogeneous=inhomogeneous, + inhomo_head=inhomo_heads[i_layer]) + self.layers.append(layer) self.norm = norm_layer(self.num_features) diff --git a/models/idynamic.py b/models/idynamic.py new file mode 100755 index 0000000..1b4cc60 --- /dev/null +++ b/models/idynamic.py @@ -0,0 +1,274 @@ +from torch.autograd import Function +import torch +from torch.nn.modules.utils import _pair +import torch.nn.functional as F +import torch.nn as nn +from mmcv.cnn import ConvModule + + +from collections import namedtuple +import cupy +from string import Template + + +Stream = namedtuple('Stream', ['ptr']) + + +def Dtype(t): + if isinstance(t, torch.cuda.FloatTensor): + return 'float' + elif isinstance(t, torch.cuda.DoubleTensor): + return 'double' + + +@cupy._util.memoize(for_each_device=True) +def load_kernel(kernel_name, code, **kwargs): + code = Template(code).substitute(**kwargs) + kernel_code = cupy.cuda.compile_with_cache(code) + return kernel_code.get_function(kernel_name) + + +CUDA_NUM_THREADS = 1024 + +kernel_loop = ''' +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) +''' + + +def GET_BLOCKS(N): + return (N + CUDA_NUM_THREADS - 1) // CUDA_NUM_THREADS + + +_idynamic_kernel = kernel_loop + ''' +extern "C" +__global__ void idynamic_forward_kernel( +const ${Dtype}* bottom_data, const ${Dtype}* weight_data, ${Dtype}* top_data) { + CUDA_KERNEL_LOOP(index, ${nthreads}) { + const int n = index / ${channels} / ${top_height} / ${top_width}; + const int c = (index / ${top_height} / ${top_width}) % ${channels}; + const int h = (index / ${top_width}) % ${top_height}; + const int w = index % ${top_width}; + const int g = c / (${channels} / ${groups}); + ${Dtype} value = 0; + #pragma unroll + for (int kh = 0; kh < ${kernel_h}; ++kh) { + #pragma unroll + for (int kw = 0; kw < ${kernel_w}; ++kw) { + const int h_in = -${pad_h} + h * ${stride_h} + kh * ${dilation_h}; + const int w_in = -${pad_w} + w * ${stride_w} + kw * ${dilation_w}; + if ((h_in >= 0) && (h_in < ${bottom_height}) + && (w_in >= 0) && (w_in < ${bottom_width})) { + const int 
offset = ((n * ${channels} + c) * ${bottom_height} + h_in) + * ${bottom_width} + w_in; + const int offset_weight = ((((n * ${groups} + g) * ${kernel_h} + kh) * ${kernel_w} + kw) * ${top_height} + h) + * ${top_width} + w; + value += weight_data[offset_weight] * bottom_data[offset]; + } + } + } + top_data[index] = value; + } +} +''' + + +_idynamic_kernel_backward_grad_input = kernel_loop + ''' +extern "C" +__global__ void idynamic_backward_grad_input_kernel( + const ${Dtype}* const top_diff, const ${Dtype}* const weight_data, ${Dtype}* const bottom_diff) { + CUDA_KERNEL_LOOP(index, ${nthreads}) { + const int n = index / ${channels} / ${bottom_height} / ${bottom_width}; + const int c = (index / ${bottom_height} / ${bottom_width}) % ${channels}; + const int h = (index / ${bottom_width}) % ${bottom_height}; + const int w = index % ${bottom_width}; + const int g = c / (${channels} / ${groups}); + ${Dtype} value = 0; + #pragma unroll + for (int kh = 0; kh < ${kernel_h}; ++kh) { + #pragma unroll + for (int kw = 0; kw < ${kernel_w}; ++kw) { + const int h_out_s = h + ${pad_h} - kh * ${dilation_h}; + const int w_out_s = w + ${pad_w} - kw * ${dilation_w}; + if (((h_out_s % ${stride_h}) == 0) && ((w_out_s % ${stride_w}) == 0)) { + const int h_out = h_out_s / ${stride_h}; + const int w_out = w_out_s / ${stride_w}; + if ((h_out >= 0) && (h_out < ${top_height}) + && (w_out >= 0) && (w_out < ${top_width})) { + const int offset = ((n * ${channels} + c) * ${top_height} + h_out) + * ${top_width} + w_out; + const int offset_weight = ((((n * ${groups} + g) * ${kernel_h} + kh) * ${kernel_w} + kw) * ${top_height} + h_out) + * ${top_width} + w_out; + value += weight_data[offset_weight] * top_diff[offset]; + } + } + } + } + bottom_diff[index] = value; + } +} +''' + + +_idynamic_kernel_backward_grad_weight = kernel_loop + ''' +extern "C" +__global__ void idynamic_backward_grad_weight_kernel( + const ${Dtype}* const top_diff, const ${Dtype}* const bottom_data, ${Dtype}* const buffer_data) { + CUDA_KERNEL_LOOP(index, ${nthreads}) { + const int h = (index / ${top_width}) % ${top_height}; + const int w = index % ${top_width}; + const int kh = (index / ${kernel_w} / ${top_height} / ${top_width}) + % ${kernel_h}; + const int kw = (index / ${top_height} / ${top_width}) % ${kernel_w}; + const int h_in = -${pad_h} + h * ${stride_h} + kh * ${dilation_h}; + const int w_in = -${pad_w} + w * ${stride_w} + kw * ${dilation_w}; + if ((h_in >= 0) && (h_in < ${bottom_height}) + && (w_in >= 0) && (w_in < ${bottom_width})) { + const int g = (index / ${kernel_h} / ${kernel_w} / ${top_height} / ${top_width}) % ${groups}; + const int n = (index / ${groups} / ${kernel_h} / ${kernel_w} / ${top_height} / ${top_width}) % ${num}; + ${Dtype} value = 0; + #pragma unroll + for (int c = g * (${channels} / ${groups}); c < (g + 1) * (${channels} / ${groups}); ++c) { + const int top_offset = ((n * ${channels} + c) * ${top_height} + h) + * ${top_width} + w; + const int bottom_offset = ((n * ${channels} + c) * ${bottom_height} + h_in) + * ${bottom_width} + w_in; + value += top_diff[top_offset] * bottom_data[bottom_offset]; + } + buffer_data[index] = value; + } else { + buffer_data[index] = 0; + } + } +} +''' + + +class _idynamic(Function): + @staticmethod + def forward(ctx, input, weight, stride, padding, dilation): + assert input.dim() == 4 and input.is_cuda + assert weight.dim() == 6 and weight.is_cuda + batch_size, channels, height, width = input.size() + kernel_h, kernel_w = weight.size()[2:4] + output_h = int((height + 2 * padding[0] - 
(dilation[0] * (kernel_h - 1) + 1)) / stride[0] + 1) + output_w = int((width + 2 * padding[1] - (dilation[1] * (kernel_w - 1) + 1)) / stride[1] + 1) + + output = input.new(batch_size, channels, output_h, output_w) + n = output.numel() + + with torch.cuda.device_of(input): + f = load_kernel('idynamic_forward_kernel', _idynamic_kernel, Dtype=Dtype(input), nthreads=n, + num=batch_size, channels=channels, groups=weight.size()[1], + bottom_height=height, bottom_width=width, + top_height=output_h, top_width=output_w, + kernel_h=kernel_h, kernel_w=kernel_w, + stride_h=stride[0], stride_w=stride[1], + dilation_h=dilation[0], dilation_w=dilation[1], + pad_h=padding[0], pad_w=padding[1]) + f(block=(CUDA_NUM_THREADS,1,1), + grid=(GET_BLOCKS(n),1,1), + args=[input.data_ptr(), weight.data_ptr(), output.data_ptr()], + stream=Stream(ptr=torch.cuda.current_stream().cuda_stream)) + + ctx.save_for_backward(input, weight) + ctx.stride, ctx.padding, ctx.dilation = stride, padding, dilation + return output + + @staticmethod + def backward(ctx, grad_output): + assert grad_output.is_cuda + if not grad_output.is_contiguous(): + grad_output.contiguous() + input, weight = ctx.saved_tensors + stride, padding, dilation = ctx.stride, ctx.padding, ctx.dilation + + batch_size, channels, height, width = input.size() + kernel_h, kernel_w = weight.size()[2:4] + output_h, output_w = grad_output.size()[2:] + + grad_input, grad_weight = None, None + + opt = dict(Dtype=Dtype(grad_output), + num=batch_size, channels=channels, groups=weight.size()[1], + bottom_height=height, bottom_width=width, + top_height=output_h, top_width=output_w, + kernel_h=kernel_h, kernel_w=kernel_w, + stride_h=stride[0], stride_w=stride[1], + dilation_h=dilation[0], dilation_w=dilation[1], + pad_h=padding[0], pad_w=padding[1]) + + with torch.cuda.device_of(input): + if ctx.needs_input_grad[0]: + grad_input = input.new(input.size()) + + n = grad_input.numel() + opt['nthreads'] = n + + f = load_kernel('idynamic_backward_grad_input_kernel', + _idynamic_kernel_backward_grad_input, **opt) + f(block=(CUDA_NUM_THREADS,1,1), + grid=(GET_BLOCKS(n),1,1), + args=[grad_output.data_ptr(), weight.data_ptr(), grad_input.data_ptr()], + stream=Stream(ptr=torch.cuda.current_stream().cuda_stream)) + + if ctx.needs_input_grad[1]: + grad_weight = weight.new(weight.size()) + + n = grad_weight.numel() + opt['nthreads'] = n + + f = load_kernel('idynamic_backward_grad_weight_kernel', + _idynamic_kernel_backward_grad_weight, **opt) + f(block=(CUDA_NUM_THREADS,1,1), + grid=(GET_BLOCKS(n),1,1), + args=[grad_output.data_ptr(), input.data_ptr(), grad_weight.data_ptr()], + stream=Stream(ptr=torch.cuda.current_stream().cuda_stream)) + + return grad_input, grad_weight, None, None, None + + +def _idynamic_cuda(input, weight, bias=None, stride=1, padding=0, dilation=1): + """ idynamic kernel + """ + assert input.size(0) == weight.size(0) + assert input.size(-2)//stride == weight.size(-2) + assert input.size(-1)//stride == weight.size(-1) + if input.is_cuda: + out = _idynamic.apply(input, weight, _pair(stride), _pair(padding), _pair(dilation)) + if bias is not None: + out += bias.view(1,-1,1,1) + else: + raise NotImplementedError + return out + + +class IDynamicDWConv(nn.Module): + + def __init__(self, + channels, + kernel_size, + group_channels): + super(IDynamicDWConv, self).__init__() + self.kernel_size = kernel_size + self.channels = channels + reduction_ratio = 4 + self.group_channels = group_channels + self.groups = self.channels // self.group_channels + self.conv1 = nn.Sequential( 
+            nn.Conv2d(channels, channels // reduction_ratio, 1),
+            nn.BatchNorm2d(channels // reduction_ratio),
+            nn.ReLU()
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(channels // reduction_ratio, kernel_size**2 * self.groups, 1)
+        )
+
+    def forward(self, x):
+        weight = self.conv2(self.conv1(x))
+        b, c, h, w = weight.shape
+        weight = weight.view(b, self.groups, self.kernel_size, self.kernel_size, h, w)
+        out = _idynamic_cuda(x, weight, stride=1, padding=(self.kernel_size-1)//2)
+        return out
diff --git a/scripts/run_dwnet_base_patch4_window7_224.sh b/scripts/run_dwnet_base_patch4_window7_224.sh
index 905c850..4cadeaf 100644
--- a/scripts/run_dwnet_base_patch4_window7_224.sh
+++ b/scripts/run_dwnet_base_patch4_window7_224.sh
@@ -20,4 +20,5 @@ MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
     --output "output/dwnet_base_patch4_window7_224" \
     --data-set IMNET \
     --batch-size 64 \
-    --amp-opt-level O0
\ No newline at end of file
+    --accumulation-steps 2 \
+    --amp-opt-level O1
diff --git a/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh b/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh
index 871a954..241f836 100644
--- a/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh
+++ b/scripts/run_dynamic_dwnet_base_patch4_window7_224.sh
@@ -20,4 +20,5 @@ MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
     --output "output/dynamic_dwnet_base_patch4_window7_224" \
     --data-set IMNET \
     --batch-size 64 \
-    --amp-opt-level O0
\ No newline at end of file
+    --accumulation-steps 2 \
+    --amp-opt-level O1
diff --git a/scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh b/scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh
new file mode 100644
index 0000000..3c2c7e4
--- /dev/null
+++ b/scripts/run_i_dynamic_dwnet_base_patch4_window7_224.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+NCCL_SOCKET_IFNAME=ib0
+
+MASTER_IP=${MASTER_IP}
+MASTER_PORT=12345
+NODE_RANK=${OMPI_COMM_WORLD_RANK} && echo NODE_RANK: ${NODE_RANK}
+PER_NODE_GPU=8 && echo PER_NODE_GPU: ${PER_NODE_GPU}
+NUM_NODE=${OMPI_COMM_WORLD_SIZE} && echo NUM_NODE: ${NUM_NODE}
+
+MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
+    --nproc_per_node 8 \
+    --nnodes=2 \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_IP \
+    --master_port=$MASTER_PORT \
+    main.py \
+    --cfg ./configs/i_dynamic_dwnet_base_patch4_window7_224.yaml \
+    --data-path "/path/to/imagenet" \
+    --output "output/i_dynamic_dwnet_base_patch4_window7_224" \
+    --data-set IMNET \
+    --batch-size 64 \
+    --accumulation-steps 2 \
+    --amp-opt-level O0
diff --git a/scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh b/scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh
new file mode 100644
index 0000000..aa066c4
--- /dev/null
+++ b/scripts/run_i_dynamic_dwnet_tiny_patch4_window7_224.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+NCCL_SOCKET_IFNAME=ib0
+
+MASTER_IP=${MASTER_IP}
+MASTER_PORT=12345
+NODE_RANK=${OMPI_COMM_WORLD_RANK} && echo NODE_RANK: ${NODE_RANK}
+PER_NODE_GPU=8 && echo PER_NODE_GPU: ${PER_NODE_GPU}
+NUM_NODE=${OMPI_COMM_WORLD_SIZE} && echo NUM_NODE: ${NUM_NODE}
+
+MKL_THREADING_LAYER=GNU python -m torch.distributed.launch \
+    --nproc_per_node 8 \
+    --nnodes=1 \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_IP \
+    --master_port=$MASTER_PORT \
+    main.py \
+    --cfg ./configs/i_dynamic_dwnet_tiny_patch4_window7_224.yaml \
+    --data-path "/path/to/imagenet" \
+    --output "output/i_dynamic_dwnet_tiny_patch4_window7_224" \
+    --data-set IMNET \
+    --batch-size 64 \
+    --accumulation-steps 2 \
+    --amp-opt-level O0
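
Reference sketch (not part of the patch): the CUDA path above (idynamic_forward_kernel used through IDynamicDWConv) applies a separate kernel_size x kernel_size filter at every output position, shared by the group_channels channels of each group. For checking the kernel on CPU, or where cupy is unavailable, an equivalent computation for this layout can be sketched in plain PyTorch with F.unfold. The names below (idynamic_dwconv_reference, IDynamicDWConvRef) are illustrative only; the weight generator mirrors the conv1/conv2 branch of IDynamicDWConv, and the sizes in the usage example loosely follow the tiny config (EMBED_DIM 96, WINDOW_SIZE 7, INHOMO_HEADS 3).

import torch
import torch.nn as nn
import torch.nn.functional as F


def idynamic_dwconv_reference(x, weight, kernel_size, stride=1, padding=0, dilation=1):
    """Pure-PyTorch sketch of the idynamic forward kernel (hypothetical helper).

    x:      (B, C, H, W) input feature map
    weight: (B, groups, K, K, H_out, W_out), one K x K filter per group and per output position
    """
    B, C, H, W = x.shape
    groups, K = weight.shape[1], kernel_size
    h_out = (H + 2 * padding - dilation * (K - 1) - 1) // stride + 1
    w_out = (W + 2 * padding - dilation * (K - 1) - 1) // stride + 1
    # (B, C*K*K, h_out*w_out): each column is the K*K neighbourhood of one output position
    patches = F.unfold(x, K, dilation=dilation, padding=padding, stride=stride)
    patches = patches.view(B, groups, C // groups, K * K, h_out, w_out)
    w = weight.reshape(B, groups, 1, K * K, h_out, w_out)
    # weight each neighbourhood with its position-specific filter, then sum over kernel positions
    return (patches * w).sum(dim=3).view(B, C, h_out, w_out)


class IDynamicDWConvRef(nn.Module):
    """CPU-friendly stand-in for IDynamicDWConv: same weight generator, unfold-based aggregation."""

    def __init__(self, channels, kernel_size, group_channels, reduction_ratio=4):
        super().__init__()
        self.kernel_size = kernel_size
        self.groups = channels // group_channels
        self.conv1 = nn.Sequential(
            nn.Conv2d(channels, channels // reduction_ratio, 1),
            nn.BatchNorm2d(channels // reduction_ratio),
            nn.ReLU(),
        )
        self.conv2 = nn.Conv2d(channels // reduction_ratio, kernel_size ** 2 * self.groups, 1)

    def forward(self, x):
        b, _, h, w = x.shape
        # per-position, per-group kernels predicted from the input itself
        weight = self.conv2(self.conv1(x))
        weight = weight.view(b, self.groups, self.kernel_size, self.kernel_size, h, w)
        return idynamic_dwconv_reference(x, weight, self.kernel_size,
                                         padding=(self.kernel_size - 1) // 2)


if __name__ == "__main__":
    # illustrative sizes, loosely matching the first stage of the tiny config
    m = IDynamicDWConvRef(channels=96, kernel_size=7, group_channels=3)
    y = m(torch.randn(2, 96, 14, 14))
    print(y.shape)  # torch.Size([2, 96, 14, 14])

The unfold-based version materializes B x C x K x K x H x W intermediate activations, which is presumably why the patch ships a fused CUDA kernel; the sketch is intended only for correctness checks on small tensors.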