Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED #57

Open
yl5y18 opened this issue Apr 4, 2023 · 0 comments
Open

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED #57

yl5y18 opened this issue Apr 4, 2023 · 0 comments

Comments

@yl5y18
Copy link

yl5y18 commented Apr 4, 2023

I want to train SETR-Naive on my own dataset. I organized the dataset according to the ADE20K directory hierarchy and then executed './tools/dist_train.sh configs/SETR/SETR-Naive_512x512_160k_ade20k_bs_16.py 1'. I copied the config file "SETR-Naive_512x512_160k_ade20k_bs_16.py" that was generated in "work_dirs" to the "configs/SETR" folder, renamed it "my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py", and modified it as follows:

# Model settings: an EncoderDecoder segmentor with a ViT-Large backbone
# ('vit_large_patch16_384'), one decode head and three auxiliary heads.
# Every num_classes entry is set to 2 for the custom dataset.

# Normalisation settings, referenced by the backbone and by every head below
# instead of being typed out again (the resulting values are identical).
norm_cfg = dict(type='SyncBN', requires_grad=True)

model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='VisionTransformer',
        model_name='vit_large_patch16_384',
        img_size=512,
        patch_size=16,
        in_chans=3,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        num_classes=2,  # changed for the custom dataset
        drop_rate=0.0,
        norm_cfg=norm_cfg,
        pos_embed_interp=True,
        align_corners=False,
    ),
    # Main head; reads backbone output index 23 and carries the full loss
    # weight (1.0).
    decode_head=dict(
        type='VisionTransformerUpHead',
        in_channels=1024,
        channels=512,
        in_index=23,
        img_size=512,
        embed_dim=1024,
        num_classes=2,  # changed for the custom dataset
        norm_cfg=norm_cfg,
        num_conv=2,
        upsampling_method='bilinear',
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        conv3x3_conv1x1=False,
    ),
    # The three auxiliary heads are identical except for the backbone output
    # index they read (9, 14 and 19); each contributes with loss weight 0.4.
    auxiliary_head=[
        dict(
            type='VisionTransformerUpHead',
            in_channels=1024,
            channels=512,
            in_index=layer_index,
            img_size=512,
            embed_dim=1024,
            num_classes=2,  # changed for the custom dataset
            norm_cfg=norm_cfg,
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            conv3x3_conv1x1=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4),
        )
        for layer_index in (9, 14, 19)
    ],
)

# No training-specific options; testing uses 512x512 sliding windows with a
# stride of 341 pixels in each direction.
train_cfg = dict()
test_cfg = dict(mode='slide', crop_size=(512, 512), stride=(341, 341))
# Dataset settings. The custom dataset is laid out like ADE20K and loaded
# through the ADE20K dataset class.
# NOTE(review): ADE20KDataset is written for ADE20K's own label set while the
# model heads use num_classes=2 -- confirm the class list matches the data.
dataset_type = 'ADE20KDataset'
data_root = '/data/SETR/data/BHMaskKidney_ideal'  # path of the custom dataset
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)

# Training pipeline: load, random rescale, crop/pad to crop_size, flip,
# colour jitter, normalise, then bundle image and label map.
# NOTE(review): reduce_zero_label=True presumably turns label 0 into the
# ignore index and shifts the other labels down by one; verify this is
# intended for a 2-class mask.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]

# Evaluation pipeline: single scale, no flipping, image only.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# The val and test splits both read the validation folders.
data = dict(
    samples_per_gpu=1,  # batch size on each GPU
    workers_per_gpu=0,  # data-loading workers per GPU (this is not a GPU count)
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/training',
        ann_dir='annotations/training',
        pipeline=train_pipeline,
    ),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline,
    ),
)
# Runtime settings: text logging every 50 iterations, NCCL backend, no
# checkpoint to load or resume from, a train-only workflow.
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=False),
    ],
)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True

# Optimiser and schedule: SGD with a polynomial learning-rate policy; the
# 'head' entry under custom_keys carries a 10x learning-rate multiplier.
optimizer = dict(
    type='SGD',
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0,
    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.0)}),
)
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)

# NOTE(review): only 160 iterations although work_dir below says "160k" --
# presumably a short trial run; confirm before a real training run.
total_iters = 160
checkpoint_config = dict(by_epoch=False, interval=50)
evaluation = dict(interval=50, metric='mIoU')
find_unused_parameters = True
work_dir = './work_dirs/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16'
gpu_ids = range(1)  # a single GPU, index 0

Then I executed './tools/dist_train.sh configs/SETR/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py 1' again.
The following error was reported:
Traceback (most recent call last):
File "./tools/train.py", line 195, in <module>
main()
File "./tools/train.py", line 191, in main
meta=meta)
File "/data/SETR/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow, cfg.total_iters)
File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 131, in run
iter_runner(iter_loaders[i], **kwargs)
File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 54, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/data/SETR/mmseg/models/segmentors/base.py", line 152, in train_step
losses = self(**data_batch)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 84, in new_func
return old_func(*args, **kwargs)
File "/data/SETR/mmseg/models/segmentors/base.py", line 122, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/data/SETR/mmseg/models/segmentors/encoder_decoder.py", line 158, in forward_train
gt_semantic_seg)
File "/data/SETR/mmseg/models/segmentors/encoder_decoder.py", line 102, in _decode_head_forward_train
self.train_cfg)
File "/data/SETR/mmseg/models/decode_heads/decode_head.py", line 185, in forward_train
seg_logits = self.forward(inputs)
File "/data/SETR/mmseg/models/decode_heads/vit_up_head.py", line 153, in forward
x = self.syncbn_fc_0(x)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 501, in forward
exponential_average_factor, self.eps)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py", line 2016, in batch_norm
training, momentum, eps, torch.backends.cudnn.enabled
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 261, in <module>
main()
File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 257, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-u', './tools/train.py', '--local_rank=0', 'configs/SETR/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
root@ipfsunion-Server:/data/SETR# Traceback (most recent call last):
File "./tools/train.py", line 195, in <module>
main()
File "./tools/train.py", line 191, in main
meta=meta)
File "/data/SETR/mmseg/apis/train.py", line 69, in train_segmentor
model.cuda(),
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 458, in cuda
return self._apply(lambda t: t.cuda(device))
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply
module._apply(fn)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply
module._apply(fn)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply
module._apply(fn)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 376, in _apply
param_applied = fn(param)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 458, in <lambda>
return self._apply(lambda t: t.cuda(device))
KeyboardInterrupt
Traceback (most recent call last):
File "./tools/train.py", line 195, in <module>
main()
File "./tools/train.py", line 191, in main
meta=meta)
File "/data/SETR/mmseg/apis/train.py", line 72, in train_segmentor
find_unused_parameters=find_unused_parameters)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 333, in __init__
self.broadcast_bucket_size)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 549, in _distributed_broadcast_coalesced
dist._broadcast_coalesced(self.process_group, tensors, buffer_size)
RuntimeError: Broken pipe

My environment configuration is as follows:
sys.platform: linux
Python: 3.7.0 (default, Oct 9 2018, 10:31:47) [GCC 7.3.0]
CUDA available: True
GPU 0,1: NVIDIA RTX A6000
NVCC: Cuda compilation tools, release 10.1, V10.1.243
GCC: gcc (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
PyTorch: 1.6.0
CuDNN 7.6.3
MMCV: 1.2.7
MMCV Compiler: GCC 7.3
MMCV CUDA Compiler: 10.1
MMSegmentation: 0.6.0+

May I ask how to handle this error?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant