diff --git a/src/ParallelKernel/Data.jl b/src/ParallelKernel/Data.jl
index 828ff0dd..2d88f570 100644
--- a/src/ParallelKernel/Data.jl
+++ b/src/ParallelKernel/Data.jl
@@ -139,12 +139,14 @@ Expands to: `NTuple{N_tuple, Data.Cell{numbertype, S}}` | `NamedTuple{names, NTu
 This datatype is not intended for explicit manual usage. [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required.
 """
 
-function Data_cuda(numbertype::DataType, indextype::DataType)
-    if numbertype == NUMBERTYPE_NONE
-        :(baremodule Data # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
+function Data_cuda(modulename::Symbol, numbertype::DataType, indextype::DataType)
+    Data_module = if (numbertype == NUMBERTYPE_NONE)
+        :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
             import Base, CUDA, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
-            CellArrays.@define_CuCellArray
-            export CuCellArray
+            # TODO: the constructors defined by CellArrays.@define_CuCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now.
+            const CuCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,CUDA.CuArray{T_elem,CellArrays._N}}
+            # CellArrays.@define_CuCellArray
+            # export CuCellArray
             const Index = $indextype
             const Array{T, N} = CUDA.CuArray{T, N}
             const DeviceArray{T, N} = CUDA.CuDeviceArray{T, N}
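The alias-only workaround above can be illustrated without a GPU. The following is a hypothetical CPU analogue (the name `MyCPUCellArray` is invented for illustration; it assumes the `CPUCellArray{Cell}(undef, dims...)` constructor documented by CellArrays.jl): a parametric type alias provides the convenient name that the `@define_*CellArray` macro would have created, while construction keeps going through the existing CellArrays constructors, so no new constructor methods (the precompilation trigger) are defined.

```julia
using CellArrays, StaticArrays

# Hypothetical CPU analogue of the alias-only workaround: the alias names the
# type without defining constructor methods. CellArrays._N (the fixed number of
# dimensions of the underlying data array) is internal to CellArrays.jl and is
# used exactly as in the diff above.
const MyCPUCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Base.Array{T_elem,CellArrays._N}}

Cell = SVector{3, Float64}
A = CPUCellArray{Cell}(undef, 4, 4)  # construct via the documented CellArrays constructor
A isa MyCPUCellArray                 # true: the alias matches existing instances
```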
@@ -152,12 +154,15 @@ function Data_cuda(numbertype::DataType, indextype::DataType)
             const DeviceCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
             const CellArray{T_elem, N, B} = CuCellArray{<:Cell{T_elem},N,B,T_elem}
             const DeviceCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceCell{T_elem},N,B,<:CUDA.CuDeviceArray{T_elem,CellArrays._N}}
+            $(create_shared_exprs(numbertype, indextype))
         end)
     else
-        :(baremodule Data # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
+        :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
             import Base, CUDA, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
-            CellArrays.@define_CuCellArray
-            export CuCellArray
+            # TODO: the constructors defined by CellArrays.@define_CuCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now.
+            const CuCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,CUDA.CuArray{T_elem,CellArrays._N}}
+            # CellArrays.@define_CuCellArray
+            # export CuCellArray
             const Index = $indextype
             const Number = $numbertype
             const Array{N} = CUDA.CuArray{$numbertype, N}
@@ -172,16 +177,20 @@ function Data_cuda(numbertype::DataType, indextype::DataType)
             const DeviceTCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
             const TCellArray{T_elem, N, B} = CuCellArray{<:TCell{T_elem},N,B,T_elem}
             const DeviceTCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceTCell{T_elem},N,B,<:CUDA.CuDeviceArray{T_elem,CellArrays._N}}
+            $(create_shared_exprs(numbertype, indextype))
         end)
     end
+    return prewalk(rmlines, flatten(Data_module))
 end
 
-function Data_amdgpu(numbertype::DataType, indextype::DataType)
-    if numbertype == NUMBERTYPE_NONE
-        :(baremodule Data # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
+function Data_amdgpu(modulename::Symbol, numbertype::DataType, indextype::DataType)
+    Data_module = if (numbertype == NUMBERTYPE_NONE)
+        :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
             import Base, AMDGPU, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
-            CellArrays.@define_ROCCellArray
-            export ROCCellArray
+            # TODO: the constructors defined by CellArrays.@define_ROCCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now.
+            const ROCCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,AMDGPU.ROCArray{T_elem,CellArrays._N}}
+            # CellArrays.@define_ROCCellArray
+            # export ROCCellArray
             const Index = $indextype
             const Array{T, N} = AMDGPU.ROCArray{T, N}
             const DeviceArray{T, N} = AMDGPU.ROCDeviceArray{T, N}
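The new `return prewalk(rmlines, flatten(Data_module))` post-processes the quoted module before it is handed to `@eval`: `flatten` merges the nested `begin ... end` block that `$(create_shared_exprs(...))` splices into the module body, and `prewalk(rmlines, ...)` strips the `LineNumberNode`s throughout. A minimal standalone sketch of the same pattern follows; the function and module names (`shared_exprs`, `make_data_module`, `MyData`) are invented for illustration. `Data_amdgpu` below mirrors this exactly, with `ROCArray` in place of `CuArray`.

```julia
using MacroTools: prewalk, rmlines, flatten

# A quote block to be spliced into the module body, as create_shared_exprs does.
shared_exprs() = quote
    const IndexTuple{N_tuple} = NTuple{N_tuple, Index}
end

# Build a module expression parametrized by its name, splice in the shared
# block, and clean the expression tree before evaluation.
function make_data_module(modulename::Symbol, indextype::DataType)
    ex = :(baremodule $modulename
        import Base
        const Index = $indextype
        $(shared_exprs())  # nested block: flatten merges it into the module body
    end)
    return prewalk(rmlines, flatten(ex))
end

@eval Main $(make_data_module(:MyData, Int64))
Main.MyData.IndexTuple  # the spliced alias NTuple{N_tuple, Int64} is defined
```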
@@ -189,12 +198,15 @@ function Data_amdgpu(numbertype::DataType, indextype::DataType)
             const DeviceCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
             const CellArray{T_elem, N, B} = ROCCellArray{<:Cell{T_elem},N,B,T_elem}
             const DeviceCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceCell{T_elem},N,B,<:AMDGPU.ROCDeviceArray{T_elem,CellArrays._N}}
+            $(create_shared_exprs(numbertype, indextype))
         end)
     else
-        :(baremodule Data # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
+        :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
             import Base, AMDGPU, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
-            CellArrays.@define_ROCCellArray
-            export ROCCellArray
+            # TODO: the constructors defined by CellArrays.@define_ROCCellArray lead to pre-compilation issues due to a bug in Julia. We therefore only create the type alias here for now.
+            const ROCCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,AMDGPU.ROCArray{T_elem,CellArrays._N}}
+            # CellArrays.@define_ROCCellArray
+            # export ROCCellArray
             const Index = $indextype
             const Number = $numbertype
             const Array{N} = AMDGPU.ROCArray{$numbertype, N}
@@ -209,13 +221,15 @@ function Data_amdgpu(numbertype::DataType, indextype::DataType)
             const DeviceTCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
             const TCellArray{T_elem, N, B} = ROCCellArray{<:TCell{T_elem},N,B,T_elem}
             const DeviceTCellArray{T_elem, N, B} = CellArrays.CellArray{<:DeviceTCell{T_elem},N,B,<:AMDGPU.ROCDeviceArray{T_elem,CellArrays._N}}
+            $(create_shared_exprs(numbertype, indextype))
         end)
     end
+    return prewalk(rmlines, flatten(Data_module))
 end
 
-function Data_threads(numbertype::DataType, indextype::DataType)
-    if numbertype == NUMBERTYPE_NONE
-        :(baremodule Data # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
+function Data_threads(modulename::Symbol, numbertype::DataType, indextype::DataType)
+    Data_module = if (numbertype == NUMBERTYPE_NONE)
+        :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
             import Base, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
             const Index = $indextype
             const Array{T, N} = Base.Array{T, N}
@@ -224,9 +238,10 @@ function Data_threads(numbertype::DataType, indextype::DataType)
             const DeviceCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
             const CellArray{T_elem, N, B} = CellArrays.CPUCellArray{<:Cell{T_elem},N,B,T_elem}
             const DeviceCellArray{T_elem, N, B} = CellArrays.CPUCellArray{<:DeviceCell{T_elem},N,B,T_elem}
+            $(create_shared_exprs(numbertype, indextype))
         end)
     else
-        :(baremodule Data # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
+        :(baremodule $modulename # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
             import Base, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
             const Index = $indextype
             const Number = $numbertype
@@ -242,11 +257,13 @@ function Data_threads(numbertype::DataType, indextype::DataType)
             const DeviceTCell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
             const TCellArray{T_elem, N, B} = CellArrays.CPUCellArray{<:TCell{T_elem},N,B,T_elem}
             const DeviceTCellArray{T_elem, N, B} = CellArrays.CPUCellArray{<:DeviceTCell{T_elem},N,B,T_elem}
+            $(create_shared_exprs(numbertype, indextype))
         end)
     end
+    return prewalk(rmlines, flatten(Data_module))
 end
 
-function Data_shared(numbertype::DataType, indextype::DataType)
+function create_shared_exprs(numbertype::DataType, indextype::DataType)
     if numbertype == NUMBERTYPE_NONE
         quote
             const IndexTuple{N_tuple} = NTuple{N_tuple, Index}
@@ -276,11 +293,12 @@ function Data_shared(numbertype::DataType, indextype::DataType)
             const CellArrayCollection{N_tuple, T_elem, N, B} = Union{CellArrayTuple{N_tuple, T_elem, N, B}, NamedCellArrayTuple{N_tuple, T_elem, N, B}}
             const DeviceCellArrayCollection{N_tuple, T_elem, N, B} = Union{DeviceCellArrayTuple{N_tuple, T_elem, N, B}, NamedDeviceCellArrayTuple{N_tuple, T_elem, N, B}}
 
-            NamedIndexTuple{}(t::NamedTuple) = Base.map(Data.Index, t)
-            NamedNumberTuple{}(T, t::NamedTuple) = Base.map(T, t)
-            NamedArrayTuple{}(T, t::NamedTuple) = Base.map(Data.Array{T}, t)
-            NamedCellTuple{}(T, t::NamedTuple) = Base.map(Data.Cell{T}, t)
-            NamedCellArrayTuple{}(T, t::NamedTuple) = Base.map(Data.CellArray{T}, t)
+            # TODO: the following constructors lead to pre-compilation issues due to a bug in Julia. They are therefore commented out for now.
+            # NamedIndexTuple{}(t::NamedTuple) = Base.map(Data.Index, t)
+            # NamedNumberTuple{}(T, t::NamedTuple) = Base.map(T, t)
+            # NamedArrayTuple{}(T, t::NamedTuple) = Base.map(Data.Array{T}, t)
+            # NamedCellTuple{}(T, t::NamedTuple) = Base.map(Data.Cell{T}, t)
+            # NamedCellArrayTuple{}(T, t::NamedTuple) = Base.map(Data.CellArray{T}, t)
         end
     else
         quote
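For reference, the commented-out convenience constructors (here and in the hunk below) only wrapped `Base.map` over a `NamedTuple`, which maps the values and preserves the names; callers can still get the same effect directly. A small standalone illustration (the values are invented):

```julia
t = (x = 1, y = 2)

# What NamedNumberTuple{}(Float64, t) provided: map a type constructor over
# the values of a NamedTuple; the field names are preserved.
Base.map(Float64, t)  # (x = 1.0, y = 2.0)
```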
@@ -332,15 +350,16 @@ function Data_shared(numbertype::DataType, indextype::DataType)
             const TCellArrayCollection{N_tuple, T_elem, N, B} = Union{TCellArrayTuple{N_tuple, T_elem, N, B}, NamedTCellArrayTuple{N_tuple, T_elem, N, B}}
             const DeviceTCellArrayCollection{N_tuple, T_elem, N, B} = Union{DeviceTCellArrayTuple{N_tuple, T_elem, N, B}, NamedDeviceTCellArrayTuple{N_tuple, T_elem, N, B}}
 
-            NamedIndexTuple{}(t::NamedTuple) = Base.map(Data.Index, t)
-            NamedNumberTuple{}(t::NamedTuple) = Base.map(Data.Number, t)
-            NamedArrayTuple{}(t::NamedTuple) = Base.map(Data.Array, t)
-            NamedCellTuple{}(t::NamedTuple) = Base.map(Data.Cell, t)
-            NamedCellArrayTuple{}(t::NamedTuple) = Base.map(Data.CellArray, t)
-            NamedTNumberTuple{}(T, t::NamedTuple) = Base.map(T, t)
-            NamedTArrayTuple{}(T, t::NamedTuple) = Base.map(Data.TArray{T}, t)
-            NamedTCellTuple{}(T, t::NamedTuple) = Base.map(Data.TCell{T}, t)
-            NamedTCellArrayTuple{}(T, t::NamedTuple) = Base.map(Data.TCellArray{T}, t)
+            # TODO: the following constructors lead to pre-compilation issues due to a bug in Julia. They are therefore commented out for now.
+            # NamedIndexTuple{}(t::NamedTuple) = Base.map(Data.Index, t)
+            # NamedNumberTuple{}(t::NamedTuple) = Base.map(Data.Number, t)
+            # NamedArrayTuple{}(t::NamedTuple) = Base.map(Data.Array, t)
+            # NamedCellTuple{}(t::NamedTuple) = Base.map(Data.Cell, t)
+            # NamedCellArrayTuple{}(t::NamedTuple) = Base.map(Data.CellArray, t)
+            # NamedTNumberTuple{}(T, t::NamedTuple) = Base.map(T, t)
+            # NamedTArrayTuple{}(T, t::NamedTuple) = Base.map(Data.TArray{T}, t)
+            # NamedTCellTuple{}(T, t::NamedTuple) = Base.map(Data.TCell{T}, t)
+            # NamedTCellArrayTuple{}(T, t::NamedTuple) = Base.map(Data.TCellArray{T}, t)
         end
     end
 end
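With Data.jl reworked as above, the shared aliases are spliced directly into the generated module expression, so `init_parallel_kernel` below needs only a single `@eval` to define the complete `Data` module in the caller. A usage sketch, assuming the Threads backend and the documented `@init_parallel_kernel(package, numbertype)` signature:

```julia
using ParallelStencil.ParallelKernel
@init_parallel_kernel(Threads, Float64)

# One @eval of the generated expression defines the complete Data module:
A = zeros(Data.Number, 4, 4)
A isa Data.Array   # true: Data.Array is Base.Array{Float64} on the Threads backend
Data.Index         # the backend index type (INT_THREADS)
```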
diff --git a/src/ParallelKernel/init_parallel_kernel.jl b/src/ParallelKernel/init_parallel_kernel.jl
index 6a3bbdeb..1f4d0490 100644
--- a/src/ParallelKernel/init_parallel_kernel.jl
+++ b/src/ParallelKernel/init_parallel_kernel.jl
@@ -26,22 +26,20 @@ macro init_parallel_kernel(args...)
 end
 
 function init_parallel_kernel(caller::Module, package::Symbol, numbertype::DataType, inbounds::Bool; datadoc_call=:())
+    modulename = :Data
     if package == PKG_CUDA
-        if (!CUDA_IS_INSTALLED) @NotInstalledError("CUDA was selected as package for parallelization, but CUDA.jl is not installed. CUDA functionality is provided with an extension of ParallelStencil and CUDA.jl needs therefore to be installed independently.") end
+        if (!is_installed("CUDA")) @NotInstalledError("CUDA was selected as package for parallelization, but CUDA.jl is not installed. CUDA functionality is provided with an extension of ParallelStencil and CUDA.jl needs therefore to be installed independently.") end
         indextype = INT_CUDA
-        data_module = Data_cuda(numbertype, indextype)
-        data_module_shared = Data_shared(numbertype, indextype)
+        data_module = Data_cuda(modulename, numbertype, indextype)
         pkg_import_cmd = :(import CUDA)
     elseif package == PKG_AMDGPU
-        if (!AMDGPU_IS_INSTALLED) @NotInstalledError("AMDGPU was selected as package for parallelization, but AMDGPU.jl is not installed. AMDGPU functionality is provided with an extension of ParallelStencil and AMDGPU.jl needs therefore to be installed independently.") end
+        if (!is_installed("AMDGPU")) @NotInstalledError("AMDGPU was selected as package for parallelization, but AMDGPU.jl is not installed. AMDGPU functionality is provided with an extension of ParallelStencil and AMDGPU.jl needs therefore to be installed independently.") end
         indextype = INT_AMDGPU
-        data_module = Data_amdgpu(numbertype, indextype)
-        data_module_shared = Data_shared(numbertype, indextype)
+        data_module = Data_amdgpu(modulename, numbertype, indextype)
         pkg_import_cmd = :(import AMDGPU)
     elseif package == PKG_THREADS
         indextype = INT_THREADS
-        data_module = Data_threads(numbertype, indextype)
-        data_module_shared = Data_shared(numbertype, indextype)
+        data_module = Data_threads(modulename, numbertype, indextype)
         pkg_import_cmd = :()
     end
     ad_init_cmd = :(ParallelStencil.ParallelKernel.AD.init_AD(ParallelStencil.ParallelKernel.PKG_THREADS))
@@ -53,7 +51,6 @@ function init_parallel_kernel(caller::Module, package::Symbol, numbertype::DataT
         end
         @eval(caller, $pkg_import_cmd)
         @eval(caller, $data_module)
-        @eval(caller.Data, $data_module_shared)
         @eval(caller, $datadoc_call)
     elseif isdefined(caller, :Data) && isdefined(caller.Data, :DeviceArray)
         if !isinteractive() @warn "Module Data from previous module initialization found in caller module ($caller); module Data not created. Note: this warning is only shown in non-interactive mode." end
diff --git a/src/ParallelKernel/shared.jl b/src/ParallelKernel/shared.jl
index e0fb81fa..3a7a6e6f 100644
--- a/src/ParallelKernel/shared.jl
+++ b/src/ParallelKernel/shared.jl
@@ -1,5 +1,5 @@
 using CellArrays, StaticArrays, MacroTools
-import MacroTools: postwalk, splitdef, combinedef, isexpr, unblock # NOTE: inexpr_walk used instead of MacroTools.inexpr
+import MacroTools: postwalk, splitdef, combinedef, isexpr, unblock, flatten, rmlines, prewalk # NOTE: inexpr_walk used instead of MacroTools.inexpr
 
 
 ## CONSTANTS AND TYPES (and the macros wrapping them)
@@ -10,8 +10,6 @@ gensym_world(tag::String, generator::Module) = gensym(string(tag, GENSYM_SEPARAT
 gensym_world(tag::Symbol, generator::Module) = gensym(string(tag, GENSYM_SEPARATOR, generator))
 gensym_world(tag::Expr, generator::Module) = gensym(string(tag, GENSYM_SEPARATOR, generator))
 
-const CUDA_IS_INSTALLED = (Base.find_package("CUDA")!==nothing)
-const AMDGPU_IS_INSTALLED = (Base.find_package("AMDGPU")!==nothing)
 const PKG_CUDA = :CUDA
 const PKG_AMDGPU = :AMDGPU
 const PKG_THREADS = :Threads
@@ -68,10 +66,10 @@ macro ranges() esc(RANGES_VARNAME) end
 macro rangelengths() esc(:(($(RANGELENGTHS_VARNAMES...),))) end
 
-
 ## FUNCTIONS TO CHECK EXTENSIONS SUPPORT
 
 is_loaded(arg) = false
+is_installed(package::String) = (Base.find_package(package)!==nothing)
 
 
 ## FUNCTIONS TO DEAL WITH KERNEL DEFINITIONS: SIGNATURES, BODY AND RETURN STATEMENT
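Replacing the `CUDA_IS_INSTALLED`/`AMDGPU_IS_INSTALLED` constants with `is_installed` defers the `Base.find_package` lookup from ParallelStencil's load (and precompile) time to the `@init_parallel_kernel` call, which plausibly keeps the answer in sync with the environment active at initialization. A standalone sketch of the check (the function body is verbatim from the diff; the package names queried are invented):

```julia
# Base.find_package returns the path of a package in the active environment,
# or nothing if it is absent. Evaluating this at call time, instead of baking
# it into a const at load time, reflects the current environment.
is_installed(package::String) = (Base.find_package(package) !== nothing)

is_installed("CUDA")             # true iff CUDA.jl is in the current environment
is_installed("NotARealPackage")  # false
```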