I want to train my own dataset with SETR-Naive. I organized the dataset according to the ADE20K directory hierarchy and ran './tools/dist_train.sh configs/SETR/SETR-Naive_512x512_160k_ade20k_bs_16.py 1'. I then copied the config file "SETR-Naive_512x512_160k_ade20k_bs_16.py" that was generated in "work_dirs" into the "configs/SETR" folder, renamed it "my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py", and modified it as follows:
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='VisionTransformer',
        model_name='vit_large_patch16_384',
        img_size=512,
        patch_size=16,
        in_chans=3,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        num_classes=2,  # I have modified the number of classes
        drop_rate=0.0,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        pos_embed_interp=True,
        align_corners=False),
    decode_head=dict(
        type='VisionTransformerUpHead',
        in_channels=1024,
        channels=512,
        in_index=23,
        img_size=512,
        embed_dim=1024,
        num_classes=2,  # I have modified the number of classes
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        num_conv=2,
        upsampling_method='bilinear',
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        conv3x3_conv1x1=False),
    auxiliary_head=[
        dict(
            type='VisionTransformerUpHead',
            in_channels=1024,
            channels=512,
            in_index=9,
            img_size=512,
            embed_dim=1024,
            num_classes=2,  # I have modified the number of classes
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            conv3x3_conv1x1=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='VisionTransformerUpHead',
            in_channels=1024,
            channels=512,
            in_index=14,
            img_size=512,
            embed_dim=1024,
            num_classes=2,  # I have modified the number of classes
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            conv3x3_conv1x1=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
        dict(
            type='VisionTransformerUpHead',
            in_channels=1024,
            channels=512,
            in_index=19,
            img_size=512,
            embed_dim=1024,
            num_classes=2,  # I have modified the number of classes
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_conv=2,
            upsampling_method='bilinear',
            align_corners=False,
            conv3x3_conv1x1=False,
            loss_decode=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4))
    ])
train_cfg = dict()
test_cfg = dict(mode='slide', crop_size=(512, 512), stride=(341, 341))
dataset_type = 'ADE20KDataset'
data_root = '/data/SETR/data/BHMaskKidney_ideal'  # I have modified the dataset path
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=1,  # batch size per GPU
    workers_per_gpu=0,  # dataloader workers per GPU
    train=dict(
        type='ADE20KDataset',
        data_root='/data/SETR/data/BHMaskKidney_ideal',
        img_dir='images/training',
        ann_dir='annotations/training',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', reduce_zero_label=True),
            dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
            dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(type='PhotoMetricDistortion'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_semantic_seg'])
        ]),
    val=dict(
        type='ADE20KDataset',
        data_root='/data/SETR/data/BHMaskKidney_ideal',
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2048, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='ADE20KDataset',
        data_root='/data/SETR/data/BHMaskKidney_ideal',
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2048, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]))
log_config = dict(
    interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
optimizer = dict(
    type='SGD',
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0,
    paramwise_cfg=dict(custom_keys=dict(head=dict(lr_mult=10.0))))
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)
total_iters = 160
checkpoint_config = dict(by_epoch=False, interval=50)
evaluation = dict(interval=50, metric='mIoU')
find_unused_parameters = True
work_dir = './work_dirs/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16'
gpu_ids = range(0, 1)
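For reference, the dataset layout and label convention that this config assumes (ADE20K-style images/ and annotations/ splits, two classes, reduce_zero_label=True) can be sanity-checked with a small script like the one below. This is only my own sketch for the BHMaskKidney_ideal data; the paths and the allowed raw label set {0, 1, 2} are assumptions about my dataset, not anything defined by SETR.

import os
import numpy as np
from PIL import Image

# Paths below are specific to my dataset; adjust as needed.
data_root = '/data/SETR/data/BHMaskKidney_ideal'
img_dir = os.path.join(data_root, 'images', 'training')
ann_dir = os.path.join(data_root, 'annotations', 'training')

assert os.path.isdir(img_dir) and os.path.isdir(ann_dir), 'ADE20K-style folders missing'

for name in sorted(os.listdir(ann_dir)):
    mask = np.array(Image.open(os.path.join(ann_dir, name)))
    values = set(np.unique(mask).tolist())
    # With num_classes=2 and reduce_zero_label=True, raw mask values should be
    # 0 (mapped to the ignore label) plus 1 and 2 (mapped to classes 0 and 1).
    # Any other value makes CrossEntropyLoss index out of range on the GPU,
    # which can surface as an opaque cuDNN/CUDA error instead of a clear message.
    if not values.issubset({0, 1, 2}):
        print(name, 'has unexpected label values:', sorted(values))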
Then I ran './tools/dist_train.sh configs/SETR/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py 1' again.
The error reported is as follows:
Traceback (most recent call last):
File "./tools/train.py", line 195, in
main()
File "./tools/train.py", line 191, in main
meta=meta)
File "/data/SETR/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow, cfg.total_iters)
File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 131, in run
iter_runner(iter_loaders[i], **kwargs)
File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train
outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 54, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/data/SETR/mmseg/models/segmentors/base.py", line 152, in train_step
losses = self(**data_batch)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 84, in new_func
return old_func(*args, **kwargs)
File "/data/SETR/mmseg/models/segmentors/base.py", line 122, in forward
return self.forward_train(img, img_metas, **kwargs)
File "/data/SETR/mmseg/models/segmentors/encoder_decoder.py", line 158, in forward_train
gt_semantic_seg)
File "/data/SETR/mmseg/models/segmentors/encoder_decoder.py", line 102, in _decode_head_forward_train
self.train_cfg)
File "/data/SETR/mmseg/models/decode_heads/decode_head.py", line 185, in forward_train
seg_logits = self.forward(inputs)
File "/data/SETR/mmseg/models/decode_heads/vit_up_head.py", line 153, in forward
x = self.syncbn_fc_0(x)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 501, in forward
exponential_average_factor, self.eps)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py", line 2016, in batch_norm
training, momentum, eps, torch.backends.cudnn.enabled
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 261, in
main()
File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 257, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-u', './tools/train.py', '--local_rank=0', 'configs/SETR/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
root@ipfsunion-Server:/data/SETR# Traceback (most recent call last):
File "./tools/train.py", line 195, in
main()
File "./tools/train.py", line 191, in main
meta=meta)
File "/data/SETR/mmseg/apis/train.py", line 69, in train_segmentor
model.cuda(),
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 458, in cuda
return self._apply(lambda t: t.cuda(device))
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply
module._apply(fn)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply
module._apply(fn)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply
module._apply(fn)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 376, in _apply
param_applied = fn(param)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 458, in
return self._apply(lambda t: t.cuda(device))
KeyboardInterrupt
Traceback (most recent call last):
File "./tools/train.py", line 195, in
main()
File "./tools/train.py", line 191, in main
meta=meta)
File "/data/SETR/mmseg/apis/train.py", line 72, in train_segmentor
find_unused_parameters=find_unused_parameters)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 333, in init
self.broadcast_bucket_size)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 549, in _distributed_broadcast_coalesced
dist._broadcast_coalesced(self.process_group, tensors, buffer_size)
RuntimeError: Broken pipe
My environment configuration is as follows:
sys.platform: linux
Python: 3.7.0 (default, Oct 9 2018, 10:31:47) [GCC 7.3.0]
CUDA available: True
GPU 0,1: NVIDIA RTX A6000
NVCC: Cuda compilation tools, release 10.1, V10.1.243
GCC: gcc (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
PyTorch: 1.6.0
CuDNN: 7.6.3
MMCV: 1.2.7
MMCV Compiler: GCC 7.3
MMCV CUDA Compiler: 10.1
MMSegmentation: 0.6.0+
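In case it is relevant: the RTX A6000 is an Ampere GPU (compute capability 8.6), while my build reports CUDA 10.1, so I also run a minimal standalone check to see whether the failing op (per the traceback, self.syncbn_fc_0 going through torch's batch_norm) reproduces outside the training code. This is just my own sketch; the tensor sizes are arbitrary.

import torch

print('torch', torch.__version__, 'CUDA', torch.version.cuda,
      'cuDNN', torch.backends.cudnn.version())
print('compute capability of GPU 0:', torch.cuda.get_device_capability(0))

# Minimal repro of the op that fails inside vit_up_head: if this alone raises
# CUDNN_STATUS_EXECUTION_FAILED, the PyTorch/CUDA build and the GPU architecture
# are mismatched, independent of my config or data.
bn = torch.nn.BatchNorm2d(512).cuda()
x = torch.randn(1, 512, 32, 32, device='cuda')
print(bn(x).shape)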
How should I handle this error?