diff --git a/dali/core/cuda_event_pool.cc b/dali/core/cuda_event_pool.cc index b90af023197..7fc49567f25 100644 --- a/dali/core/cuda_event_pool.cc +++ b/dali/core/cuda_event_pool.cc @@ -29,6 +29,9 @@ CUDAEventPool::CUDAEventPool(unsigned event_flags) { int num_devices = 0; CUDA_CALL(cudaGetDeviceCount(&num_devices)); dev_events_.resize(num_devices); + for (int i = 0; i < 20000; i++) { + Put(CUDAEvent::CreateWithFlags(cudaEventDisableTiming)); + } } CUDAEvent CUDAEventPool::Get(int device_id) { diff --git a/dali/operators/imgcodec/image_decoder.h b/dali/operators/imgcodec/image_decoder.h index 16a15d94be3..32571c8998e 100644 --- a/dali/operators/imgcodec/image_decoder.h +++ b/dali/operators/imgcodec/image_decoder.h @@ -572,17 +572,17 @@ class ImageDecoder : public StatelessOperator { }; }; - int nblocks = tp_->NumThreads() + 1; - if (nsamples > nblocks * 4) { + if (nsamples < 16) { + get_task(0, 1)(-1); // run all in current thread + } else { + int nblocks = std::max(tp_->NumThreads() + 1, 8); int block_idx = 0; - for (; block_idx < tp_->NumThreads(); block_idx++) { + for (; block_idx < nblocks - 1; block_idx++) { tp_->AddWork(get_task(block_idx, nblocks), -block_idx); } - tp_->RunAll(false); // start work but not wait - get_task(block_idx, nblocks)(-1); // run last block - tp_->WaitForWork(); // wait for the other threads - } else { // not worth parallelizing - get_task(0, 1)(-1); // run all in current thread + tp_->RunAll(false); // start work but not wait + get_task(block_idx, nblocks)(-1); // run last block + tp_->WaitForWork(); // wait for the other threads } output_descs[0] = {std::move(shapes), dtype_}; @@ -773,17 +773,17 @@ class ImageDecoder : public StatelessOperator { }; }; - int nblocks = tp_->NumThreads() + 1; - if (decode_nsamples > nblocks * 4) { + if (decode_nsamples < 16) { + get_task(0, 1)(-1); // run all in current thread + } else { + int nblocks = std::max(tp_->NumThreads() + 1, 8); int block_idx = 0; - for (; block_idx < tp_->NumThreads(); block_idx++) { + for (; block_idx < nblocks - 1; block_idx++) { tp_->AddWork(get_task(block_idx, nblocks), -block_idx); } tp_->RunAll(false); // start work but not wait get_task(block_idx, nblocks)(-1); // run last block tp_->WaitForWork(); // wait for the other threads - } else { // not worth parallelizing - get_task(0, 1)(-1); // run all in current thread } for (int orig_idx : decode_sample_idxs_) {