diff --git a/dali/core/cuda_event_pool.cc b/dali/core/cuda_event_pool.cc
index b90af023197..7fc49567f25 100644
--- a/dali/core/cuda_event_pool.cc
+++ b/dali/core/cuda_event_pool.cc
@@ -29,6 +29,9 @@ CUDAEventPool::CUDAEventPool(unsigned event_flags) {
   int num_devices = 0;
   CUDA_CALL(cudaGetDeviceCount(&num_devices));
   dev_events_.resize(num_devices);
+  for (int i = 0; i < 20000; i++) {
+    Put(CUDAEvent::CreateWithFlags(cudaEventDisableTiming));
+  }
 }
 
 CUDAEvent CUDAEventPool::Get(int device_id) {
diff --git a/dali/operators/imgcodec/image_decoder.h b/dali/operators/imgcodec/image_decoder.h
index 16a15d94be3..32571c8998e 100644
--- a/dali/operators/imgcodec/image_decoder.h
+++ b/dali/operators/imgcodec/image_decoder.h
@@ -572,17 +572,17 @@ class ImageDecoder : public StatelessOperator<Backend> {
       };
     };
 
-    int nblocks = tp_->NumThreads() + 1;
-    if (nsamples > nblocks * 4) {
+    if (nsamples < 16) {
+      get_task(0, 1)(-1);  // run all in current thread
+    } else {
+      int nblocks = std::max(tp_->NumThreads() + 1, 8);
       int block_idx = 0;
-      for (; block_idx < tp_->NumThreads(); block_idx++) {
+      for (; block_idx < nblocks - 1; block_idx++) {
         tp_->AddWork(get_task(block_idx, nblocks), -block_idx);
       }
-      tp_->RunAll(false);                // start work but not wait
-      get_task(block_idx, nblocks)(-1);  // run last block
-      tp_->WaitForWork();                // wait for the other threads
-    } else {                             // not worth parallelizing
-      get_task(0, 1)(-1);                // run all in current thread
+      tp_->RunAll(false);                 // start work but not wait
+      get_task(block_idx, nblocks)(-1);   // run last block
+      tp_->WaitForWork();                 // wait for the other threads
     }
 
     output_descs[0] = {std::move(shapes), dtype_};
@@ -773,17 +773,17 @@ class ImageDecoder : public StatelessOperator<Backend> {
         };
       };
 
-      int nblocks = tp_->NumThreads() + 1;
-      if (decode_nsamples > nblocks * 4) {
+      if (decode_nsamples < 16) {
+        get_task(0, 1)(-1);  // run all in current thread
+      } else {
+        int nblocks = std::max(tp_->NumThreads() + 1, 8);
         int block_idx = 0;
-        for (; block_idx < tp_->NumThreads(); block_idx++) {
+        for (; block_idx < nblocks - 1; block_idx++) {
           tp_->AddWork(get_task(block_idx, nblocks), -block_idx);
         }
         tp_->RunAll(false);                 // start work but not wait
         get_task(block_idx, nblocks)(-1);   // run last block
         tp_->WaitForWork();                 // wait for the other threads
-      } else {                              // not worth parallelizing
-        get_task(0, 1)(-1);                 // run all in current thread
       }
 
       for (int orig_idx : decode_sample_idxs_) {