diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 524e5752b7..540a7d3c73 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -8,9 +8,19 @@ on: jobs: pre-commit: runs-on: ubuntu-20.04 + defaults: + run: + shell: bash -el {0} steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: conda-incubator/setup-miniconda@v2 with: python-version: '3.11' - - uses: pre-commit/action@v3.0.0 + activate-environment: "coverage" + channel-priority: "disabled" + environment-file: environment/pre-commit.yml + - uses: actions/cache@v3 + with: + path: ~/.cache/pre-commit + key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }} + - run: pre-commit run --show-diff-on-failure --color=always --all-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19615e0f10..1753b7a3fe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,3 +47,17 @@ repos: args: ["-i"] exclude: "numba_dpex/dpnp_iface" types_or: [c++, c] +- repo: local + hooks: + - id: pylint + name: pylint + entry: pylint + files: ^numba_dpex/experimental + language: system + types: [python] + require_serial: true + args: + [ + "-rn", # Only display messages + "-sn", # Don't display the score + ] diff --git a/environment/pre-commit.yml b/environment/pre-commit.yml new file mode 100644 index 0000000000..3cfb9024b6 --- /dev/null +++ b/environment/pre-commit.yml @@ -0,0 +1,25 @@ +name: dev +channels: + - dppy/label/dev + - numba + - intel + - conda-forge + - nodefaults +dependencies: + - libffi + - gxx_linux-64 + - dpcpp_linux-64 + - numba==0.58* + - dpctl + - dpnp + - dpcpp-llvm-spirv + - opencl_rt + - coverage + - pytest + - pytest-cov + - pytest-xdist + - pexpect + - scikit-build>=0.15* + - cmake>=3.26* + - pre-commit + - pylint diff --git a/numba_dpex/config.py b/numba_dpex/config.py index fef20002a5..ece1ba8ecd 100644 --- a/numba_dpex/config.py +++ b/numba_dpex/config.py @@ -59,10 +59,15 @@ def __getattr__(name): "NUMBA_DPEX_DEBUGINFO", int, config.DEBUGINFO_DEFAULT ) -# Emit LLVM assembly language format(.ll) -DUMP_KERNEL_LLVM = _readenv( - "NUMBA_DPEX_DUMP_KERNEL_LLVM", int, config.DUMP_OPTIMIZED -) +# Emit LLVM IR generated for kernel decorated function +DUMP_KERNEL_LLVM = _readenv("NUMBA_DPEX_DUMP_KERNEL_LLVM", int, 0) + +# Emit LLVM module generated to launch a kernel decorated function +DUMP_KERNEL_LAUNCHER = _readenv("NUMBA_DPEX_DUMP_KERNEL_LAUNCHER", int, 0) + +# Enables debug printf messages inside the kernel launcher module generated for +# a kernel decorated function +DEBUG_KERNEL_LAUNCHER = _readenv("NUMBA_DPEX_DEBUG_KERNEL_LAUNCHER", int, 0) # configs for caching # To see the debug messages for the caching. 
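Illustrative aside, not part of the patch: the three new switches added to numba_dpex/config.py above are plain integer environment variables read through _readenv(). A minimal sketch of turning them on from Python, assuming they are only consulted when numba_dpex is first imported as the surrounding _readenv() calls suggest:

import os

# Set the debug/dump switches before numba_dpex is imported; config.py reads
# the NUMBA_DPEX_* variables once via _readenv() at module import time.
os.environ["NUMBA_DPEX_DUMP_KERNEL_LLVM"] = "1"
os.environ["NUMBA_DPEX_DUMP_KERNEL_LAUNCHER"] = "1"
os.environ["NUMBA_DPEX_DEBUG_KERNEL_LAUNCHER"] = "1"

import numba_dpex  # noqa: E402  imported only after the environment is prepared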
diff --git a/numba_dpex/core/descriptor.py b/numba_dpex/core/descriptor.py index 6d40686289..406f7115f8 100644 --- a/numba_dpex/core/descriptor.py +++ b/numba_dpex/core/descriptor.py @@ -38,11 +38,13 @@ def _inherit_if_not_set(flags, options, name, default=targetconfig._NotSet): class DpexTargetOptions(CPUTargetOptions): experimental = _option_mapping("experimental") release_gil = _option_mapping("release_gil") + no_compile = _option_mapping("no_compile") def finalize(self, flags, options): super().finalize(flags, options) _inherit_if_not_set(flags, options, "experimental", False) _inherit_if_not_set(flags, options, "release_gil", False) + _inherit_if_not_set(flags, options, "no_compile", True) class DpexKernelTarget(TargetDescriptor): diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py index 8d5c68366f..aa755c0f77 100644 --- a/numba_dpex/core/exceptions.py +++ b/numba_dpex/core/exceptions.py @@ -215,6 +215,12 @@ def __init__(self, kernel_name, *, usmarray_argnum_list) -> None: f"usm_ndarray arguments {usmarray_args} were not allocated " "on the same queue." ) + else: + self.message = ( + f'Execution queue for kernel "{kernel_name}" could not ' + "be deduced using compute follows data programming model. The " + "kernel has no USMNdArray argument." + ) super().__init__(self.message) diff --git a/numba_dpex/core/parfors/parfor_lowerer.py b/numba_dpex/core/parfors/parfor_lowerer.py index eb8e9e1d87..bcc3ae2082 100644 --- a/numba_dpex/core/parfors/parfor_lowerer.py +++ b/numba_dpex/core/parfors/parfor_lowerer.py @@ -3,9 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 import copy +from collections import namedtuple from llvmlite import ir as llvmir -from numba.core import ir, types +from numba.core import cgutils, ir, types from numba.parfors.parfor import ( find_potential_aliases_parfor, get_parfor_outputs, @@ -27,6 +28,12 @@ create_reduction_remainder_kernel_for_parfor, ) +_KernelArgs = namedtuple( + "_KernelArgs", + ["num_flattened_args", "arg_vals", "arg_types"], +) + + # A global list of kernels to keep the objects alive indefinitely. keep_alive_kernels = [] @@ -84,21 +91,7 @@ class ParforLowerImpl: for a parfor and submits it to a queue. """ - def _get_exec_queue(self, kernel_fn, lowerer): - """Creates a stack variable storing the sycl queue pointer used to - launch the kernel function. - """ - self.kernel_builder = KernelLaunchIRBuilder( - lowerer.context, lowerer.builder, kernel_fn.kernel.addressof_ref() - ) - - # Create a local variable storing a pointer to a DPCTLSyclQueueRef - # pointer. - self.curr_queue = self.kernel_builder.get_queue( - exec_queue=kernel_fn.queue - ) - - def _build_kernel_arglist(self, kernel_fn, lowerer): + def _build_kernel_arglist(self, kernel_fn, lowerer, kernel_builder): """Creates local variables for all the arguments and the argument types that are passes to the kernel function. @@ -110,7 +103,7 @@ AssertionError: If the LLVM IR Value for an argument defined in Numba IR is not found. """ - self.num_flattened_args = 0 + num_flattened_args = 0 # Compute number of args to be passed to the kernel.
Note that the # actual number of kernel arguments is greater than the count of @@ -118,31 +111,35 @@ def _build_kernel_arglist(self, kernel_fn, lowerer): for arg_type in kernel_fn.kernel_arg_types: if isinstance(arg_type, DpnpNdArray): datamodel = dpex_dmm.lookup(arg_type) - self.num_flattened_args += datamodel.flattened_field_count + num_flattened_args += datamodel.flattened_field_count elif arg_type == types.complex64 or arg_type == types.complex128: - self.num_flattened_args += 2 + num_flattened_args += 2 else: - self.num_flattened_args += 1 + num_flattened_args += 1 # Create LLVM values for the kernel args list and kernel arg types list - self.args_list = self.kernel_builder.allocate_kernel_arg_array( - self.num_flattened_args - ) - self.args_ty_list = self.kernel_builder.allocate_kernel_arg_ty_array( - self.num_flattened_args + args_list = kernel_builder.allocate_kernel_arg_array(num_flattened_args) + args_ty_list = kernel_builder.allocate_kernel_arg_ty_array( + num_flattened_args ) callargs_ptrs = [] for arg in kernel_fn.kernel_args: callargs_ptrs.append(_getvar(lowerer, arg)) - self.kernel_builder.populate_kernel_args_and_args_ty_arrays( + kernel_builder.populate_kernel_args_and_args_ty_arrays( kernel_argtys=kernel_fn.kernel_arg_types, callargs_ptrs=callargs_ptrs, - args_list=self.args_list, - args_ty_list=self.args_ty_list, + args_list=args_list, + args_ty_list=args_ty_list, datamodel_mgr=dpex_dmm, ) + return _KernelArgs( + num_flattened_args=num_flattened_args, + arg_vals=args_list, + arg_types=args_ty_list, + ) + def _submit_parfor_kernel( self, lowerer, @@ -156,9 +153,11 @@ def _submit_parfor_kernel( # Ensure that the Python arguments are kept alive for the duration of # the kernel execution keep_alive_kernels.append(kernel_fn.kernel) + kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder) + + ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue) + args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder) - self._get_exec_queue(kernel_fn, lowerer) - self._build_kernel_arglist(kernel_fn, lowerer) # Create a global range over which to submit the kernel based on the # loop_ranges of the parfor global_range = [] @@ -178,18 +177,26 @@ def _submit_parfor_kernel( local_range = [] + kernel_ref_addr = kernel_fn.kernel.addressof_ref() + kernel_ref = lowerer.builder.inttoptr( + lowerer.context.get_constant(types.uintp, kernel_ref_addr), + cgutils.voidptr_t, + ) + curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref) + # Submit a synchronous kernel - self.kernel_builder.submit_sync_kernel( - self.curr_queue, - self.num_flattened_args, - self.args_list, - self.args_ty_list, - global_range, - local_range, + kernel_builder.submit_sycl_kernel( + sycl_kernel_ref=kernel_ref, + sycl_queue_ref=curr_queue_ref, + total_kernel_args=args.num_flattened_args, + arg_list=args.arg_vals, + arg_ty_list=args.arg_types, + global_range=global_range, + local_range=local_range, ) # At this point we can free the DPCTLSyclQueueRef (curr_queue) - self.kernel_builder.free_queue(sycl_queue_val=self.curr_queue) + kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref) def _submit_reduction_main_parfor_kernel( self, @@ -204,9 +211,11 @@ def _submit_reduction_main_parfor_kernel( # Ensure that the Python arguments are kept alive for the duration of # the kernel execution keep_alive_kernels.append(kernel_fn.kernel) + kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder) + + ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue) 
- self._get_exec_queue(kernel_fn, lowerer) - self._build_kernel_arglist(kernel_fn, lowerer) + args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder) # Create a global range over which to submit the kernel based on the # loop_ranges of the parfor global_range = [] @@ -220,16 +229,27 @@ def _submit_reduction_main_parfor_kernel( _load_range(lowerer, reductionHelper.work_group_size) ) + kernel_ref_addr = kernel_fn.kernel.addressof_ref() + kernel_ref = lowerer.builder.inttoptr( + lowerer.context.get_constant(types.uintp, kernel_ref_addr), + cgutils.voidptr_t, + ) + curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref) + # Submit a synchronous kernel - self.kernel_builder.submit_sync_kernel( - self.curr_queue, - self.num_flattened_args, - self.args_list, - self.args_ty_list, - global_range, - local_range, + kernel_builder.submit_sycl_kernel( + sycl_kernel_ref=kernel_ref, + sycl_queue_ref=curr_queue_ref, + total_kernel_args=args.num_flattened_args, + arg_list=args.arg_vals, + arg_ty_list=args.arg_types, + global_range=global_range, + local_range=local_range, ) + # At this point we can free the DPCTLSyclQueueRef (curr_queue) + kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref) + def _submit_reduction_remainder_parfor_kernel( self, lowerer, @@ -243,8 +263,11 @@ def _submit_reduction_remainder_parfor_kernel( # the kernel execution keep_alive_kernels.append(kernel_fn.kernel) - self._get_exec_queue(kernel_fn, lowerer) - self._build_kernel_arglist(kernel_fn, lowerer) + kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder) + + ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue) + + args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder) # Create a global range over which to submit the kernel based on the # loop_ranges of the parfor global_range = [] @@ -255,16 +278,27 @@ def _submit_reduction_remainder_parfor_kernel( local_range = [] + kernel_ref_addr = kernel_fn.kernel.addressof_ref() + kernel_ref = lowerer.builder.inttoptr( + lowerer.context.get_constant(types.uintp, kernel_ref_addr), + cgutils.voidptr_t, + ) + curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref) + # Submit a synchronous kernel - self.kernel_builder.submit_sync_kernel( - self.curr_queue, - self.num_flattened_args, - self.args_list, - self.args_ty_list, - global_range, - local_range, + kernel_builder.submit_sycl_kernel( + sycl_kernel_ref=kernel_ref, + sycl_queue_ref=curr_queue_ref, + total_kernel_args=args.num_flattened_args, + arg_list=args.arg_vals, + arg_ty_list=args.arg_types, + global_range=global_range, + local_range=local_range, ) + # At this point we can free the DPCTLSyclQueueRef (curr_queue) + kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref) + def _reduction_codegen( self, parfor, diff --git a/numba_dpex/core/parfors/reduction_helper.py b/numba_dpex/core/parfors/reduction_helper.py index 31cd8ff618..93d9a147c0 100644 --- a/numba_dpex/core/parfors/reduction_helper.py +++ b/numba_dpex/core/parfors/reduction_helper.py @@ -395,11 +395,7 @@ def work_group_size(self): def copy_final_sum_to_host(self, parfor_kernel): lowerer = self.lowerer - ir_builder = KernelLaunchIRBuilder( - lowerer.context, - lowerer.builder, - parfor_kernel.kernel.addressof_ref(), - ) + ir_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder) # Create a local variable storing a pointer to a DPCTLSyclQueueRef # pointer. 
@@ -447,4 +443,4 @@ def copy_final_sum_to_host(self, parfor_kernel): sycl.dpctl_event_wait(builder, event_ref) sycl.dpctl_event_delete(builder, event_ref) - ir_builder.free_queue(sycl_queue_val=curr_queue) + ir_builder.free_queue(ptr_to_sycl_queue_ref=curr_queue) diff --git a/numba_dpex/core/runtime/_dpexrt_python.c b/numba_dpex/core/runtime/_dpexrt_python.c index d6e4798013..91f695ee4e 100644 --- a/numba_dpex/core/runtime/_dpexrt_python.c +++ b/numba_dpex/core/runtime/_dpexrt_python.c @@ -467,9 +467,9 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, mi->size = nitems * itemsize; mi->external_allocator = ext_alloca; - DPEXRT_DEBUG(drt_debug_print( - "DPEXRT-DEBUG: NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, - ext_alloca)); + DPEXRT_DEBUG(drt_debug_print("DPEXRT-DEBUG: NRT_MemInfo_init mi=%p " + "external_allocator=%p at %s, line %d\n", + mi, ext_alloca, __FILE__, __LINE__)); return mi; @@ -797,14 +797,15 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, Py_IncRef(obj); DPEXRT_DEBUG(drt_debug_print( - "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_from_python.\n")); + "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_from_python at %s, line %d\n", + __FILE__, __LINE__)); // Check if the PyObject obj has an _array_obj attribute that is of // dpctl.tensor.usm_ndarray type. if (!(arrayobj = PyUSMNdArray_ARRAYOBJ(obj))) { DPEXRT_DEBUG(drt_debug_print( - "DPEXRT-ERROR: PyUSMNdArray_ARRAYOBJ check failed %d\n", __FILE__, - __LINE__)); + "DPEXRT-ERROR: PyUSMNdArray_ARRAYOBJ check failed at %s, line %d\n", + __FILE__, __LINE__)); goto error; } @@ -874,6 +875,11 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, } } + DPEXRT_DEBUG( + drt_debug_print("DPEXRT-DEBUG: Done with unboxing call to " + "DPEXRT_sycl_usm_ndarray_from_python at %s, line %d\n", + __FILE__, __LINE__)); + return 0; error: diff --git a/numba_dpex/core/utils/kernel_launcher.py b/numba_dpex/core/utils/kernel_launcher.py index 79c13254c8..143ba1b3d5 100644 --- a/numba_dpex/core/utils/kernel_launcher.py +++ b/numba_dpex/core/utils/kernel_launcher.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +from llvmlite import ir as llvmir from numba.core import cgutils, types from numba_dpex import utils @@ -20,17 +21,16 @@ class KernelLaunchIRBuilder: for submitting kernels. The LLVM Values that """ - def __init__(self, context, builder, kernel_addr): + def __init__(self, context, builder): """Create a KernelLauncher for the specified kernel. Args: - context: A Numba target context that will be used to generate the code. + context: A Numba target context that will be used to generate the + code. builder: An llvmlite IRBuilder instance used to generate LLVM IR. - kernel_addr: The address of a SYCL kernel. """ self.context = context self.builder = builder - self.kernel_addr = kernel_addr self.rtctx = DpexRTContext(self.context) def _build_nullptr(self): @@ -275,7 +275,7 @@ def get_queue(self, exec_queue): ) return sycl_queue_val - def free_queue(self, sycl_queue_val): + def free_queue(self, ptr_to_sycl_queue_ref): """ Frees the ``DPCTLSyclQueueRef`` pointer that was used to launch the kernels. @@ -283,7 +283,7 @@ def free_queue(self, sycl_queue_val): Args: sycl_queue_val: The SYCL queue pointer to be freed. 
""" - qref = self.builder.load(sycl_queue_val) + qref = self.builder.load(ptr_to_sycl_queue_ref) sycl.dpctl_queue_delete(self.builder, qref) def allocate_kernel_arg_array(self, num_kernel_args): @@ -324,20 +324,24 @@ def allocate_kernel_arg_ty_array(self, num_kernel_args): return args_ty_list def _create_sycl_range(self, idx_range): - """_summary_ + """Allocate a size_t[3] array to store the extents of a sycl::range. + + Sycl supports upto 3-dimensional ranges and a such the array is + statically sized to length three. Only the elements that store an actual + range value are populated based on the size of the idx_range argument. - Args: - idx_range (_type_): _description_ """ intp_t = utils.get_llvm_type(context=self.context, type=types.intp) intp_ptr_t = utils.get_llvm_ptr_type(intp_t) num_dim = len(idx_range) + MAX_SIZE_OF_SYCL_RANGE = 3 + # form the global range - global_range = cgutils.alloca_once( + range_list = cgutils.alloca_once( self.builder, utils.get_llvm_type(context=self.context, type=types.uintp), - size=self.context.get_constant(types.uintp, num_dim), + size=self.context.get_constant(types.uintp, MAX_SIZE_OF_SYCL_RANGE), ) for i in range(num_dim): @@ -350,32 +354,32 @@ def _create_sycl_range(self, idx_range): self.builder.store( rext, self.builder.gep( - global_range, + range_list, [self.context.get_constant(types.uintp, (num_dim - 1) - i)], ), ) - return self.builder.bitcast(global_range, intp_ptr_t) + return self.builder.bitcast(range_list, intp_ptr_t) - def submit_sync_kernel( + def submit_sycl_kernel( self, - sycl_queue_val, + sycl_kernel_ref, + sycl_queue_ref, total_kernel_args, arg_list, arg_ty_list, global_range, - local_range=None, - ): + local_range=[], + wait_before_return=True, + ) -> llvmir.PointerType(llvmir.IntType(8)): """ Submits the kernel to the specified queue, waits. """ + eref = None gr = self._create_sycl_range(global_range) args1 = [ - self.builder.inttoptr( - self.context.get_constant(types.uintp, self.kernel_addr), - utils.get_llvm_type(context=self.context, type=types.voidptr), - ), - self.builder.load(sycl_queue_val), + sycl_kernel_ref, + sycl_queue_ref, arg_list, arg_ty_list, self.context.get_constant(types.uintp, total_kernel_args), @@ -394,11 +398,18 @@ def submit_sync_kernel( args = [] if len(local_range) == 0: args = args1 + args2 - self.rtctx.submit_range(self.builder, *args) + eref = sycl.dpctl_queue_submit_range(self.builder, *args) else: lr = self._create_sycl_range(local_range) args = args1 + [lr] + args2 - self.rtctx.submit_ndrange(self.builder, *args) + eref = sycl.dpctl_queue_submit_ndrange(self.builder, *args) + + if wait_before_return: + sycl.dpctl_event_wait(self.builder, eref) + sycl.dpctl_event_delete(self.builder, eref) + return None + else: + return eref def populate_kernel_args_and_args_ty_arrays( self, diff --git a/numba_dpex/dpctl_iface/libsyclinterface_bindings.py b/numba_dpex/dpctl_iface/libsyclinterface_bindings.py index c515d4a96f..9975e96d0f 100644 --- a/numba_dpex/dpctl_iface/libsyclinterface_bindings.py +++ b/numba_dpex/dpctl_iface/libsyclinterface_bindings.py @@ -30,6 +30,9 @@ def _build_dpctl_function(llvm_module, return_ty, arg_list, func_name): return fn +# --------------------- DpctlSyclQueue functions ------------------------------# + + def dpctl_queue_copy(builder: llvmir.IRBuilder, *args): """Inserts LLVM IR to call DPCTLQueue_Copy to create a copy of a DpctlSyclQueueRef pointer passed in to the function. 
@@ -66,42 +69,255 @@ def dpctl_queue_memcpy(builder: llvmir.IRBuilder, *args): return ret -def dpctl_event_wait(builder: llvmir.IRBuilder, *args): - """Inserts LLVM IR to call DPCTLEvent_Wait.""" +def dpctl_queue_delete(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLQueue_Delete.""" mod = builder.module fn = _build_dpctl_function( llvm_module=mod, - return_ty=llvmir.VoidType(), + return_ty=llvmir.types.VoidType(), arg_list=[cgutils.voidptr_t], - func_name="DPCTLEvent_Wait", + func_name="DPCTLQueue_Delete", ) ret = builder.call(fn, args) return ret -def dpctl_event_delete(builder: llvmir.IRBuilder, *args): - """Inserts LLVM IR to call DPCTLEvent_Delete.""" +def dpctl_queue_get_context(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLQueue_GetContext.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=cgutils.voidptr_t, + arg_list=[cgutils.voidptr_t], + func_name="DPCTLQueue_GetContext", + ) + ret = builder.call(fn, args) + + return ret + + +def dpctl_queue_get_device(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLQueue_GetDevice.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=cgutils.voidptr_t, + arg_list=[cgutils.voidptr_t], + func_name="DPCTLQueue_GetDevice", + ) + ret = builder.call(fn, args) + + return ret + + +def dpctl_queue_submit_range(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLQueue_SubmitRange. + + DPCTLSyclEventRef + DPCTLQueue_SubmitRange( + const DPCTLSyclKernelRef KRef, + const DPCTLSyclQueueRef QRef, + void** Args, + const DPCTLKernelArgType* ArgTypes, + size_t NArgs, + const size_t Range[3], + size_t NRange, + const DPCTLSyclEventRef* DepEvents, + size_t NDepEvents + ); + + """ + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=cgutils.voidptr_t, + arg_list=[ + cgutils.voidptr_t, + cgutils.voidptr_t, + cgutils.voidptr_t.as_pointer(), + cgutils.int32_t.as_pointer(), + llvmir.IntType(64), + llvmir.IntType(64).as_pointer(), + llvmir.IntType(64), + cgutils.voidptr_t, + llvmir.IntType(64), + ], + func_name="DPCTLQueue_SubmitRange", + ) + ret = builder.call(fn, args) + + return ret + + +def dpctl_queue_submit_ndrange(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLQueue_SubmitNDRange. 
+ + DPCTLSyclEventRef + DPCTLQueue_SubmitNDRange( + const DPCTLSyclKernelRef KRef, + const DPCTLSyclQueueRef QRef, + void** Args, + const DPCTLKernelArgType* ArgTypes, + size_t NArgs, + const size_t gRange[3], + const size_t lRange[3], + size_t NDims, + const DPCTLSyclEventRef* DepEvents, + size_t NDepEvents + ); + + """ + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=cgutils.voidptr_t, + arg_list=[ + cgutils.voidptr_t, + cgutils.voidptr_t, + cgutils.voidptr_t.as_pointer(), + cgutils.int32_t.as_pointer(), + llvmir.IntType(64), + llvmir.IntType(64).as_pointer(), + llvmir.IntType(64).as_pointer(), + llvmir.IntType(64), + cgutils.voidptr_t, + llvmir.IntType(64), + ], + func_name="DPCTLQueue_SubmitNDRange", + ) + ret = builder.call(fn, args) + + return ret + + +# --------------------- DpctlSyclDevice functions -----------------------------# + + +def dpctl_device_delete(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLDevice_Delete.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=llvmir.types.VoidType(), + arg_list=[cgutils.voidptr_t], + func_name="DPCTLDevice_Delete", + ) + ret = builder.call(fn, args) + + return ret + + +# --------------------- DpctlSyclKernelBundle functions -----------------------# + + +def dpctl_kernel_bundle_create_from_spirv(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLKernelBundle_CreateFromSpirv.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=cgutils.voidptr_t, + arg_list=[ + cgutils.voidptr_t, + cgutils.voidptr_t, + cgutils.voidptr_t, + llvmir.IntType(64), + cgutils.voidptr_t, + ], + func_name="DPCTLKernelBundle_CreateFromSpirv", + ) + ret = builder.call(fn, args) + + return ret + + +def dpctl_kernel_bundle_delete(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLKernelBundle_Delete.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=llvmir.types.VoidType(), + arg_list=[cgutils.voidptr_t], + func_name="DPCTLKernelBundle_Delete", + ) + ret = builder.call(fn, args) + + return ret + + +def dpctl_kernel_bundle_get_kernel(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLKernelBundle_GetKernel.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=cgutils.voidptr_t, + arg_list=[cgutils.voidptr_t, cgutils.voidptr_t], + func_name="DPCTLKernelBundle_GetKernel", + ) + ret = builder.call(fn, args) + + return ret + + +# --------------------- DpctlSyclKernel functions -----------------------------# + + +def dpctl_kernel_delete(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLKernel_Delete.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=llvmir.types.VoidType(), + arg_list=[cgutils.voidptr_t], + func_name="DPCTLKernel_Delete", + ) + ret = builder.call(fn, args) + + return ret + + +# --------------------- DpctlSyclContext functions ----------------------------# + + +def dpctl_context_delete(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLContext_Delete.""" + mod = builder.module + fn = _build_dpctl_function( + llvm_module=mod, + return_ty=llvmir.types.VoidType(), + arg_list=[cgutils.voidptr_t], + func_name="DPCTLContext_Delete", + ) + ret = builder.call(fn, args) + + return ret + + +# --------------------- DpctlSyclEvent functions ------------------------------# + + +def dpctl_event_wait(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR 
to call DPCTLEvent_Wait.""" mod = builder.module fn = _build_dpctl_function( llvm_module=mod, return_ty=llvmir.VoidType(), arg_list=[cgutils.voidptr_t], - func_name="DPCTLEvent_Delete", + func_name="DPCTLEvent_Wait", ) ret = builder.call(fn, args) return ret -def dpctl_queue_delete(builder: llvmir.IRBuilder, *args): - """Inserts LLVM IR to call DPCTLQueue_Delete.""" +def dpctl_event_delete(builder: llvmir.IRBuilder, *args): + """Inserts LLVM IR to call DPCTLEvent_Delete.""" mod = builder.module fn = _build_dpctl_function( llvm_module=mod, return_ty=llvmir.VoidType(), arg_list=[cgutils.voidptr_t], - func_name="DPCTLQueue_Delete", + func_name="DPCTLEvent_Delete", ) ret = builder.call(fn, args) diff --git a/numba_dpex/experimental/__init__.py b/numba_dpex/experimental/__init__.py new file mode 100644 index 0000000000..589d10df5d --- /dev/null +++ b/numba_dpex/experimental/__init__.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Contains experimental features that are meant as engineering preview and not +yet production ready. +""" + +from numba.core.imputils import Registry + +from .decorators import kernel +from .kernel_dispatcher import KernelDispatcher +from .launcher import call_kernel +from .models import * +from .types import KernelDispatcherType + +registry = Registry() +lower_constant = registry.lower_constant + + +@lower_constant(KernelDispatcherType) +def dpex_dispatcher_const(context): + """Dummy lowerer for a KernelDispatcherType object. It is added so that a + KernelDispatcher can be passed as an argument to dpjit. + """ + return context.get_dummy_value() + + +__all__ = ["kernel", "KernelDispatcher", "call_kernel"] diff --git a/numba_dpex/experimental/decorators.py b/numba_dpex/experimental/decorators.py new file mode 100644 index 0000000000..22565eb1cb --- /dev/null +++ b/numba_dpex/experimental/decorators.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +""" The set of experimental decorators provided by numba_dpex that are not yet +ready to move to numba_dpex.core. +""" +import inspect + +from numba.core import sigutils + +from .kernel_dispatcher import KernelDispatcher + + +def kernel(func_or_sig=None, debug=False, cache=False, **options): + """A decorator to define a kernel function. + + A kernel function is conceptually equivalent to a SYCL kernel function, and + gets compiled into either an OpenCL or a LevelZero SPIR-V binary kernel. + A kernel decorated Python function has the following restrictions: + + * The function can not return any value. + * All array arguments passed to a kernel should adhere to compute + follows data programming model. + """ + # FIXME: The options need to be evaluated and checked here like it is + # done in numba.core.decorators.jit + + def _kernel_dispatcher(pyfunc, sigs=None): + return KernelDispatcher( + pyfunc=pyfunc, + debug_flags=debug, + enable_cache=cache, + specialization_sigs=sigs, + targetoptions=options, + ) + + if func_or_sig is None: + return _kernel_dispatcher + + if isinstance(func_or_sig, str): + raise NotImplementedError( + "Specifying signatures as string is not yet supported by numba-dpex" + ) + + if isinstance(func_or_sig, list) or sigutils.is_signature(func_or_sig): + # String signatures are not supported as passing usm_ndarray type as + # a string is not possible. 
Numba's sigutils relies on the type being + # available in Numba's `types.__dict__` and dpex types are not + # registered there yet. + if isinstance(func_or_sig, list): + for sig in func_or_sig: + if isinstance(sig, str): + raise NotImplementedError( + "Specifying signatures as string is not yet supported " + "by numba-dpex" + ) + # Specialized signatures can either be a single signature or a list. + # In case only one signature is provided convert it to a list + if not isinstance(func_or_sig, list): + func_or_sig = [func_or_sig] + + def _specialized_kernel_dispatcher(pyfunc): + return KernelDispatcher( + pyfunc=pyfunc, + debug_flags=debug, + enable_cache=cache, + specialization_sigs=func_or_sig, + ) + + return _specialized_kernel_dispatcher + func = func_or_sig + if not inspect.isfunction(func): + raise ValueError( + "Argument passed to the kernel decorator is neither a " + "function object, nor a signature. If you are trying to " + "specialize the kernel that takes a single argument, specify " + "the return type as void explicitly." + ) + return _kernel_dispatcher(func) diff --git a/numba_dpex/experimental/kernel_dispatcher.py b/numba_dpex/experimental/kernel_dispatcher.py new file mode 100644 index 0000000000..9f23274848 --- /dev/null +++ b/numba_dpex/experimental/kernel_dispatcher.py @@ -0,0 +1,346 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Implements a new numba dispatcher class and a compiler class to compile and +call numba_dpex.kernel decorated function. +""" +import functools +from collections import Counter, OrderedDict, namedtuple +from contextlib import ExitStack + +import numba.core.event as ev +from numba.core import errors, sigutils, types, utils +from numba.core.caching import NullCache +from numba.core.compiler import CompileResult +from numba.core.compiler_lock import global_compiler_lock +from numba.core.dispatcher import Dispatcher, _DispatcherBase, _FunctionCompiler +from numba.core.typing.typeof import Purpose, typeof + +from numba_dpex import config, spirv_generator +from numba_dpex.core.descriptor import dpex_kernel_target +from numba_dpex.core.exceptions import ( + ExecutionQueueInferenceError, + UnsupportedKernelArgumentError, +) +from numba_dpex.core.pipelines import kernel_compiler +from numba_dpex.core.types import DpnpNdArray + +_KernelModule = namedtuple("_KernelModule", ["kernel_name", "kernel_bitcode"]) + +_KernelCompileResult = namedtuple( + "_KernelCompileResult", + ["status", "cres_or_error", "entry_point"], +) + + +class _KernelCompiler(_FunctionCompiler): + """A special compiler class used to compile numba_dpex.kernel decorated + functions. + """ + + def check_queue_equivalence_of_args( + self, py_func_name: str, args: [types.Type, ...] + ): + """Evaluates if all DpnpNdArray arguments passed to a kernel function + have the same DpctlSyclQueue type. + + Args: + py_func_name (str): Name of the kernel that is being evaluated + args ([types.Type, ...]): List of numba inferred types for each + argument passed to the kernel + + Raises: + ExecutionQueueInferenceError: If all DpnpNdArray arguments were not + allocated on the same dpctl.SyclQueue + ExecutionQueueInferenceError: If no DpnpNdArray arguments + were passed to the kernel.
+ """ + common_queue = None + + for arg in args: + if isinstance(arg, DpnpNdArray): + if common_queue is None: + common_queue = arg.queue + elif common_queue != arg.queue: + raise ExecutionQueueInferenceError( + kernel_name=py_func_name, usmarray_argnum_list=[] + ) + + if common_queue is None: + raise ExecutionQueueInferenceError( + kernel_name=py_func_name, usmarray_argnum_list=None + ) + + def _compile_to_spirv( + self, kernel_library, kernel_fndesc, kernel_targetctx + ): + kernel_func = kernel_library.get_function(kernel_fndesc.llvm_func_name) + + # Create a spir_kernel wrapper function + kernel_fn = kernel_targetctx.prepare_spir_kernel( + kernel_func, kernel_fndesc.argtypes + ) + + # makes sure that the spir_func is completely inlined into the + # spir_kernel wrapper + kernel_library._optimize_final_module() + # Compiled the LLVM IR to SPIR-V + kernel_spirv_module = spirv_generator.llvm_to_spirv( + kernel_targetctx, + kernel_library._final_module, + kernel_library._final_module.as_bitcode(), + ) + return _KernelModule( + kernel_name=kernel_fn.name, kernel_bitcode=kernel_spirv_module + ) + + def compile(self, args, return_type): + kcres = self._compile_cached(args, return_type) + if kcres.status: + return kcres + + raise kcres.cres_or_error + + def _compile_cached( + self, args, return_type: types.Type + ) -> _KernelCompileResult: + """Compiles the kernel function to bitcode and generates a host-callable + wrapper to submit the kernel to a SYCL queue. + + The LLVM IR generated for the kernel function is available in the + CompileResult objected returned by + numba_dpex.core.pipeline.kernel_compiler.KernelCompiler. + + Once the kernel decorated function is compiled down to LLVM IR, the + following steps are performed: + + a) compile the IR into SPIR-V kernel + b) generate a host callable wrapper function that will create a + sycl::kernel_bundle from the SPIR-V and then submits the + kernel_bundle to a sycl::queue + c) create a cpython_wrapper_function for the host callable wrapper + function. + d) create a cfunc_wrapper_function to make the host callable wrapper + function callable inside another JIT-compiled function. + + Args: + args (tuple(types.Type)): A tuple of numba.core.Type instances each + representing the numba-inferred type of a kernel argument. + + return_type (types.Type): The numba-inferred type of the returned + value from the kernel. Should always be types.NoneType. + + Returns: + CompileResult: A CompileResult object storing the LLVM library for + the host-callable wrapper function. + """ + key = tuple(args), return_type + try: + return _KernelCompileResult(False, self._failed_cache[key], None) + except KeyError: + pass + + try: + kernel_cres: CompileResult = self._compile_core(args, return_type) + + kernel_library = kernel_cres.library + kernel_fndesc = kernel_cres.fndesc + kernel_targetctx = kernel_cres.target_context + + kernel_module = self._compile_to_spirv( + kernel_library, kernel_fndesc, kernel_targetctx + ) + + if config.DUMP_KERNEL_LLVM: + with open( + kernel_cres.fndesc.llvm_func_name + ".ll", + "w", + encoding="UTF-8", + ) as f: + f.write(kernel_cres.library._final_module) + + except errors.TypingError as e: + self._failed_cache[key] = e + return _KernelCompileResult(False, e, None) + + return _KernelCompileResult(True, kernel_cres, kernel_module) + + +class KernelDispatcher(Dispatcher): + """Dispatcher class designed to compile kernel decorated functions. The + dispatcher inherits the Numba Dispatcher class, but has a different + compilation strategy. 
Instead of compiling a kernel decorated function to + an executable binary, the dispatcher compiles it to SPIR-V and then caches + that SPIR-V bitcode. + + FIXME: Fix issues identified by pylint with this class. + https://github.com/IntelPython/numba-dpex/issues/1196 + + """ + + targetdescr = dpex_kernel_target + _fold_args = False + + Dispatcher._impl_kinds["kernel"] = _KernelCompiler + + def __init__( + self, + pyfunc, + debug_flags=None, + compile_flags=None, + specialization_sigs=None, + enable_cache=True, + locals={}, + targetoptions={}, + impl_kind="kernel", + pipeline_class=kernel_compiler.KernelCompiler, + ): + targetoptions["nopython"] = True + targetoptions["experimental"] = True + + self._kernel_name = pyfunc.__name__ + self.typingctx = self.targetdescr.typing_context + self.targetctx = self.targetdescr.target_context + + pysig = utils.pysignature(pyfunc) + arg_count = len(pysig.parameters) + + self.overloads = OrderedDict() + + can_fallback = not targetoptions.get("nopython", False) + + _DispatcherBase.__init__( + self, + arg_count, + pyfunc, + pysig, + can_fallback, + exact_match_required=False, + ) + + functools.update_wrapper(self, pyfunc) + + self.targetoptions = targetoptions + self.locals = locals + self._cache = NullCache() + compiler_class = self._impl_kinds[impl_kind] + self._impl_kind = impl_kind + self._compiler: _KernelCompiler = compiler_class( + pyfunc, self.targetdescr, targetoptions, locals, pipeline_class + ) + self._cache_hits = Counter() + self._cache_misses = Counter() + + self._type = types.Dispatcher(self) + self.typingctx.insert_global(self, self._type) + + # Remember target restriction + self._required_target_backend = targetoptions.get("target_backend") + + def typeof_pyval(self, val): + """ + Resolve the Numba type of Python value *val*. + This is called from numba._dispatcher as a fallback if the native code + cannot decide the type. + """ + # Not going through the resolve_argument_type() indirection + # can save a couple µs. 
+ try: + tp = typeof(val, Purpose.argument) + if isinstance(tp, types.Array) and not isinstance(tp, DpnpNdArray): + raise UnsupportedKernelArgumentError( + type=str(type(val)), value=val + ) + except ValueError: + tp = types.pyobject + else: + if tp is None: + tp = types.pyobject + self._types_active_call.append(tp) + return tp + + def add_overload(self, cres): + args = tuple(cres.signature.args) + self.overloads[args] = cres.entry_point + + def compile(self, sig) -> _KernelCompileResult: + disp = self._get_dispatcher_for_current_target() + if disp is not self: + return disp.compile(sig) + + with ExitStack() as scope: + cres = None + + def cb_compiler(dur): + if cres is not None: + self._callback_add_compiler_timer(dur, cres) + + def cb_llvm(dur): + if cres is not None: + self._callback_add_llvm_timer(dur, cres) + + scope.enter_context( + ev.install_timer("numba:compiler_lock", cb_compiler) + ) + scope.enter_context(ev.install_timer("numba:llvm_lock", cb_llvm)) + scope.enter_context(global_compiler_lock) + + if not self._can_compile: + raise RuntimeError("compilation disabled") + # Use counter to track recursion compilation depth + with self._compiling_counter: + args, return_type = sigutils.normalize_signature(sig) + + try: + self._compiler.check_queue_equivalence_of_args( + self._kernel_name, args + ) + except ExecutionQueueInferenceError as eqie: + raise eqie + + # Don't recompile if signature already exists + existing = self.overloads.get(tuple(args)) + if existing is not None: + return existing + + # TODO: Enable caching + # Add code to enable on disk caching of a binary spirv kernel. + # Refer: https://github.com/IntelPython/numba-dpex/issues/1197 + self._cache_misses[sig] += 1 + ev_details = { + "dispatcher": self, + "args": args, + "return_type": return_type, + } + with ev.trigger_event("numba_dpex:compile", data=ev_details): + try: + kcres: _KernelCompileResult = self._compiler.compile( + args, return_type + ) + except errors.ForceLiteralArg as e: + + def folded(args, kws): + return self._compiler.fold_argument_types( + args, kws + )[1] + + raise e.bind_fold_arguments(folded) + self.add_overload(kcres.cres_or_error) + + # TODO: enable caching of kernel_module + # https://github.com/IntelPython/numba-dpex/issues/1197 + + return kcres.entry_point + + def __getitem__(self, args): + """Square-bracket notation for configuring launch arguments is not + supported. + """ + + raise NotImplementedError + + def __call__(self, *args, **kw_args): + """Functor to launch a kernel.""" + + raise NotImplementedError diff --git a/numba_dpex/experimental/launcher.py b/numba_dpex/experimental/launcher.py new file mode 100644 index 0000000000..ea242e1846 --- /dev/null +++ b/numba_dpex/experimental/launcher.py @@ -0,0 +1,398 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Provides a helper function to call a numba_dpex.kernel decorated function +from either CPython or a numba_dpex.dpjit decorated function. 
+""" + +from collections import namedtuple +from typing import Union + +from llvmlite import ir as llvmir +from numba.core import cgutils, cpu, types +from numba.extending import intrinsic, overload + +from numba_dpex import config, dpjit +from numba_dpex.core.exceptions import UnreachableError +from numba_dpex.core.targets.kernel_target import DpexKernelTargetContext +from numba_dpex.core.types import DpnpNdArray, NdRangeType, RangeType +from numba_dpex.core.utils import kernel_launcher as kl +from numba_dpex.dpctl_iface import libsyclinterface_bindings as sycl +from numba_dpex.experimental.kernel_dispatcher import _KernelModule +from numba_dpex.utils import create_null_ptr + +_KernelArgs = namedtuple( + "_KernelArgs", + [ + "flattened_args_count", + "array_of_kernel_args", + "array_of_kernel_arg_types", + ], +) + +_KernelSubmissionArgs = namedtuple( + "_KernelSubmissionArgs", + [ + "kernel_ref", + "queue_ref", + "kernel_args", + "global_range_extents", + "local_range_extents", + ], +) + +_LLVMIRValuesForIndexSpace = namedtuple( + "_LLVMIRValuesForNdRange", ["global_range_extents", "local_range_extents"] +) + + +class _LaunchTrampolineFunctionBodyGenerator: + """ + Helper class to generate the LLVM IR for the launch_trampoline intrinsic. + """ + + def _get_num_flattened_kernel_args( + self, + kernel_targetctx: DpexKernelTargetContext, + kernel_argtys: tuple[types.Type, ...], + ): + num_flattened_kernel_args = 0 + for arg_type in kernel_argtys: + if isinstance(arg_type, DpnpNdArray): + datamodel = kernel_targetctx.data_model_manager.lookup(arg_type) + num_flattened_kernel_args += datamodel.flattened_field_count + elif arg_type in [types.complex64, types.complex128]: + num_flattened_kernel_args += 2 + else: + num_flattened_kernel_args += 1 + + return num_flattened_kernel_args + + def __init__( + self, + codegen_targetctx: cpu.CPUContext, + kernel_targetctx: DpexKernelTargetContext, + builder: llvmir.IRBuilder, + ): + self._cpu_codegen_targetctx = codegen_targetctx + self._kernel_targetctx = kernel_targetctx + self._builder = builder + self._klbuilder = kl.KernelLaunchIRBuilder(kernel_targetctx, builder) + + if config.DEBUG_KERNEL_LAUNCHER: + cgutils.printf( + self._builder, + "DPEX-DEBUG: Inside the kernel launcher function\n", + ) + + def insert_kernel_bitcode_as_byte_str( + self, kernel_module: _KernelModule + ) -> None: + """Inserts a global constant byte string in the current LLVM module to + store the passed in SPIR-V binary blob. + """ + return self._cpu_codegen_targetctx.insert_const_bytes( + self._builder.module, + bytes=kernel_module.kernel_bitcode, + ) + + def populate_kernel_args_and_argsty_arrays( + self, + kernel_argtys: tuple[types.Type, ...], + kernel_args: [llvmir.Instruction, ...], + ) -> _KernelArgs: + """Allocates an LLVM array value to store each flattened kernel arg and + another LLVM array to store the typeid for each flattened kernel arg. + The arrays are the populated with the LLVM value for each arg. 
+ """ + num_flattened_kernel_args = self._get_num_flattened_kernel_args( + kernel_targetctx=self._kernel_targetctx, kernel_argtys=kernel_argtys + ) + + # Create LLVM values for the kernel args list and kernel arg types list + args_list = self._klbuilder.allocate_kernel_arg_array( + num_flattened_kernel_args + ) + args_ty_list = self._klbuilder.allocate_kernel_arg_ty_array( + num_flattened_kernel_args + ) + kernel_args_ptrs = [] + for arg in kernel_args: + ptr = self._builder.alloca(arg.type) + self._builder.store(arg, ptr) + kernel_args_ptrs.append(ptr) + + # Populate the args_list and the args_ty_list LLVM arrays + self._klbuilder.populate_kernel_args_and_args_ty_arrays( + callargs_ptrs=kernel_args_ptrs, + kernel_argtys=kernel_argtys, + args_list=args_list, + args_ty_list=args_ty_list, + datamodel_mgr=self._kernel_targetctx.data_model_manager, + ) + + if config.DEBUG_KERNEL_LAUNCHER: + cgutils.printf( + self._builder, + "DPEX-DEBUG: Populated kernel args and arg type arrays.\n", + ) + + return _KernelArgs( + flattened_args_count=num_flattened_kernel_args, + array_of_kernel_args=args_list, + array_of_kernel_arg_types=args_ty_list, + ) + + def get_queue_ref_val( + self, + kernel_argtys: [types.Type, ...], + kernel_args: [llvmir.Instruction, ...], + ): + """ + Get the sycl queue from the first DpnpNdArray argument. Prior passes + before lowering make sure that compute-follows-data is enforceable + for a specific call to a kernel. As such, at the stage of lowering + the queue from the first DpnpNdArray argument can be extracted. + """ + + for arg_num, argty in enumerate(kernel_argtys): + if isinstance(argty, DpnpNdArray): + llvm_val = kernel_args[arg_num] + datamodel = self._kernel_targetctx.data_model_manager.lookup( + argty + ) + sycl_queue_attr_pos = datamodel.get_field_position("sycl_queue") + ptr_to_queue_ref = self._builder.extract_value( + llvm_val, sycl_queue_attr_pos + ) + break + + return ptr_to_queue_ref + + def get_kernel(self, kernel_module, kbref): + """Returns the pointer to the sycl::kernel object in a passed in + sycl::kernel_bundle wrapper object. + """ + kernel_name = self._cpu_codegen_targetctx.insert_const_string( + self._builder.module, kernel_module.kernel_name + ) + return sycl.dpctl_kernel_bundle_get_kernel( + self._builder, kbref, kernel_name + ) + + def create_llvm_values_for_index_space( + self, + indexer_argty: Union[RangeType, NdRangeType], + index_space_arg: llvmir.BaseStructType, + ) -> _LLVMIRValuesForIndexSpace: + """Returns a list of LLVM IR Values that hold the unboxed extents of a + Python Range or NdRange object. 
+ """ + ndim = indexer_argty.ndim + grange_extents = [] + lrange_extents = [] + datamodel = self._kernel_targetctx.data_model_manager.lookup( + indexer_argty + ) + + if isinstance(indexer_argty, RangeType): + for dim_num in range(ndim): + dim_pos = datamodel.get_field_position("dim" + str(dim_num)) + grange_extents.append( + self._builder.extract_value(index_space_arg, dim_pos) + ) + elif isinstance(indexer_argty, NdRangeType): + for dim_num in range(ndim): + gdim_pos = datamodel.get_field_position("gdim" + str(dim_num)) + grange_extents.append( + self._builder.extract_value(index_space_arg, gdim_pos) + ) + ldim_pos = datamodel.get_field_position("ldim" + str(dim_num)) + lrange_extents.append( + self._builder.extract_value(index_space_arg, ldim_pos) + ) + else: + raise UnreachableError + + return _LLVMIRValuesForIndexSpace( + global_range_extents=grange_extents, + local_range_extents=lrange_extents, + ) + + def create_kernel_bundle_from_spirv( + self, + queue_ref: llvmir.PointerType, + kernel_bc: llvmir.Constant, + kernel_bc_size_in_bytes: int, + ) -> llvmir.CallInstr: + """Calls DPCTLKernelBundle_CreateFromSpirv to create an opaque pointer + to a sycl::kernel_bundle from the SPIR-V generated for a kernel. + """ + dref = sycl.dpctl_queue_get_device(self._builder, queue_ref) + cref = sycl.dpctl_queue_get_context(self._builder, queue_ref) + args = [ + cref, + dref, + kernel_bc, + llvmir.Constant(llvmir.IntType(64), kernel_bc_size_in_bytes), + self._builder.load( + create_null_ptr(self._builder, self._cpu_codegen_targetctx) + ), + ] + kbref = sycl.dpctl_kernel_bundle_create_from_spirv(self._builder, *args) + sycl.dpctl_context_delete(self._builder, cref) + sycl.dpctl_device_delete(self._builder, dref) + + if config.DEBUG_KERNEL_LAUNCHER: + cgutils.printf( + self._builder, + "DPEX-DEBUG: Generated kernel_bundle from SPIR-V.\n", + ) + + return kbref + + def submit_and_wait(self, submit_call_args: _KernelSubmissionArgs) -> None: + """Generates LLVM IR CallInst to submit a kernel to specified SYCL queue + and then call DPCTLEvent_Wait on the returned event. + """ + if config.DEBUG_KERNEL_LAUNCHER: + cgutils.printf( + self._builder, "DPEX-DEBUG: Submit sync range kernel.\n" + ) + + eref = self._klbuilder.submit_sycl_kernel( + sycl_kernel_ref=submit_call_args.kernel_ref, + sycl_queue_ref=submit_call_args.queue_ref, + total_kernel_args=submit_call_args.kernel_args.flattened_args_count, + arg_list=submit_call_args.kernel_args.array_of_kernel_args, + arg_ty_list=submit_call_args.kernel_args.array_of_kernel_arg_types, + global_range=submit_call_args.global_range_extents, + local_range=submit_call_args.local_range_extents, + wait_before_return=False, + ) + if config.DEBUG_KERNEL_LAUNCHER: + cgutils.printf(self._builder, "DPEX-DEBUG: Wait on event.\n") + + sycl.dpctl_event_wait(self._builder, eref) + sycl.dpctl_event_delete(self._builder, eref) + + def cleanup( + self, + kernel_ref: llvmir.Instruction, + kernel_bundle_ref: llvmir.Instruction, + ) -> None: + """Generates calls to free up temporary resources that were allocated in + the launch_trampoline body. + """ + # Delete the kernel ref + sycl.dpctl_kernel_delete(self._builder, kernel_ref) + # Delete the kernel bundle pointer + sycl.dpctl_kernel_bundle_delete(self._builder, kernel_bundle_ref) + + +@intrinsic(target="cpu") +def intrin_launch_trampoline( + typingctx, kernel_fn, index_space, kernel_args # pylint: disable=W0613 +): + """Generates the body of the launch_trampoline overload. 
+ + The intrinsic first compiles the kernel function to SPIRV, and then to a + sycl kernel bundle. The arguments to the kernel are also packed into + flattened arrays and the sycl queue to which the kernel will be submitted + extracted from the args. Finally, the actual kernel is extracted from the + kernel bundle and submitted to the sycl queue. + """ + kernel_args_list = list(kernel_args) + # signature of this intrinsic + sig = types.void(kernel_fn, index_space, kernel_args) + # signature of the kernel_fn + kernel_sig = types.void(*kernel_args_list) + kmodule: _KernelModule = kernel_fn.dispatcher.compile(kernel_sig) + kernel_targetctx = kernel_fn.dispatcher.targetctx + + def codegen(cgctx, builder, sig, llargs): + kernel_argtys = kernel_sig.args + kernel_args_unpacked = [] + for pos in range(len(kernel_args)): + kernel_args_unpacked.append(builder.extract_value(llargs[2], pos)) + + fn_body_gen = _LaunchTrampolineFunctionBodyGenerator( + codegen_targetctx=cgctx, + kernel_targetctx=kernel_targetctx, + builder=builder, + ) + + kernel_bc_byte_str = fn_body_gen.insert_kernel_bitcode_as_byte_str( + kmodule + ) + + populated_kernel_args = ( + fn_body_gen.populate_kernel_args_and_argsty_arrays( + kernel_argtys, kernel_args_unpacked + ) + ) + + qref = fn_body_gen.get_queue_ref_val( + kernel_argtys=kernel_argtys, + kernel_args=kernel_args_unpacked, + ) + + kbref = fn_body_gen.create_kernel_bundle_from_spirv( + queue_ref=qref, + kernel_bc=kernel_bc_byte_str, + kernel_bc_size_in_bytes=len(kmodule.kernel_bitcode), + ) + + kref = fn_body_gen.get_kernel(kmodule, kbref) + + index_space_values = fn_body_gen.create_llvm_values_for_index_space( + indexer_argty=sig.args[1], + index_space_arg=llargs[1], + ) + + submit_call_args = _KernelSubmissionArgs( + kernel_ref=kref, + queue_ref=qref, + kernel_args=populated_kernel_args, + global_range_extents=index_space_values.global_range_extents, + local_range_extents=index_space_values.local_range_extents, + ) + + fn_body_gen.submit_and_wait(submit_call_args) + + fn_body_gen.cleanup(kernel_bundle_ref=kbref, kernel_ref=kref) + + return sig, codegen + + +# pylint: disable=W0613 +def _launch_trampoline(kernel_fn, index_space, *kernel_args): + pass + + +@overload(_launch_trampoline, target="cpu") +def _ol_launch_trampoline(kernel_fn, index_space, *kernel_args): + def impl(kernel_fn, index_space, *kernel_args): + intrin_launch_trampoline( # pylint: disable=E1120 + kernel_fn, index_space, kernel_args + ) + + return impl + + +@dpjit +def call_kernel(kernel_fn, index_space, *kernel_args): + """Calls a numba_dpex.kernel decorated function from CPython or from another + dpjit function. + + Args: + kernel_fn (numba_dpex.experimental.KernelDispatcher): A + numba_dpex.kernel decorated function that is compiled to a + KernelDispatcher by numba_dpex. + index_space (Range | NdRange): A numba_dpex.Range or numba_dpex.NdRange + type object that specifies the index space for the kernel. + kernel_args : List of objects that are passed to the numba_dpex.kernel + decorated function. + """ + _launch_trampoline(kernel_fn, index_space, *kernel_args) diff --git a/numba_dpex/experimental/models.py b/numba_dpex/experimental/models.py new file mode 100644 index 0000000000..588d44ed19 --- /dev/null +++ b/numba_dpex/experimental/models.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Provides Numba datamodel for the numba_dpex types introduced in the +numba_dpex.experimental module. 
+""" + +from numba.core.datamodel import models +from numba.core.extending import register_model + +from numba_dpex.core.datamodel.models import dpex_data_model_manager as dmm + +from .types import KernelDispatcherType + +# Register the types and datamodel in the DpexKernelTargetContext +dmm.register(KernelDispatcherType, models.OpaqueModel) + +# Register the types and datamodel in the DpexTargetContext +register_model(KernelDispatcherType)(models.OpaqueModel) diff --git a/numba_dpex/experimental/types.py b/numba_dpex/experimental/types.py new file mode 100644 index 0000000000..73c901e3cb --- /dev/null +++ b/numba_dpex/experimental/types.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Experimental types that will eventually move to numba_dpex.core.types +""" + +from numba.core import types + + +class KernelDispatcherType(types.Dispatcher): + """The type of KernelDispatcher dispatchers""" + + def cast_python_value(self, args): + raise NotImplementedError diff --git a/numba_dpex/tests/experimental/__init__.py b/numba_dpex/tests/experimental/__init__.py new file mode 100644 index 0000000000..3a217e6325 --- /dev/null +++ b/numba_dpex/tests/experimental/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/numba_dpex/tests/experimental/test_exec_queue_inference.py b/numba_dpex/tests/experimental/test_exec_queue_inference.py new file mode 100644 index 0000000000..35dc300ae0 --- /dev/null +++ b/numba_dpex/tests/experimental/test_exec_queue_inference.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpctl +import dpnp +import pytest + +import numba_dpex.experimental as exp_dpex +from numba_dpex import Range +from numba_dpex.core.exceptions import ExecutionQueueInferenceError + + +@exp_dpex.kernel( + release_gil=False, + no_compile=True, + no_cpython_wrapper=True, + no_cfunc_wrapper=True, +) +def add(a, b, c): + c[0] = b[0] + a[0] + + +def test_successful_execution_queue_inference(): + """ + Tests if KernelDispatcher successfully infers the execution queue for the + kernel. + """ + + q = dpctl.SyclQueue() + a = dpnp.ones(100, sycl_queue=q) + b = dpnp.ones_like(a, sycl_queue=q) + c = dpnp.zeros_like(a, sycl_queue=q) + r = Range(100) + + # FIXME: This test fails unexpectedly if the NUMBA_CAPTURED_ERRORS is set + # to "new_style". + # Refer: https://github.com/IntelPython/numba-dpex/issues/1195 + try: + exp_dpex.call_kernel(add, r, a, b, c) + except: + pytest.fail("Unexpected error when calling kernel") + + assert c[0] == b[0] + a[0] + + +def test_execution_queue_inference_error(): + """ + Tests if KernelDispatcher successfully raised ExecutionQueueInferenceError + when dpnp.ndarray arguments do not share the same dpctl.SyclQueue + instance. 
+ """ + + q1 = dpctl.SyclQueue() + q2 = dpctl.SyclQueue() + a = dpnp.ones(100, sycl_queue=q1) + b = dpnp.ones_like(a, sycl_queue=q2) + c = dpnp.zeros_like(a, sycl_queue=q1) + r = Range(100) + + from numba.core import config + + current_captured_error_style = config.CAPTURED_ERRORS + config.CAPTURED_ERRORS = "new_style" + + with pytest.raises(ExecutionQueueInferenceError): + exp_dpex.call_kernel(add, r, a, b, c) + + config.CAPTURED_ERRORS = current_captured_error_style + + +def test_error_when_no_array_args(): + """ + Tests if KernelDispatcher successfully raised ExecutionQueueInferenceError + when no dpnp.ndarray arguments were passed to a kernel. + """ + a = 1 + b = 2 + c = 3 + r = Range(100) + + from numba.core import config + + current_captured_error_style = config.CAPTURED_ERRORS + config.CAPTURED_ERRORS = "new_style" + + with pytest.raises(ExecutionQueueInferenceError): + exp_dpex.call_kernel(add, r, a, b, c) + + config.CAPTURED_ERRORS = current_captured_error_style diff --git a/numba_dpex/tests/experimental/test_kernel_dispatcher.py b/numba_dpex/tests/experimental/test_kernel_dispatcher.py new file mode 100644 index 0000000000..abda1e1ed3 --- /dev/null +++ b/numba_dpex/tests/experimental/test_kernel_dispatcher.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpnp + +import numba_dpex.experimental as exp_dpex +from numba_dpex import NdRange, Range, dpjit + + +@exp_dpex.kernel( + release_gil=False, + no_compile=True, + no_cpython_wrapper=True, + no_cfunc_wrapper=True, +) +def add(a, b, c): + c[0] = b[0] + a[0] + + +@exp_dpex.kernel( + release_gil=False, + no_compile=True, + no_cpython_wrapper=True, + no_cfunc_wrapper=True, +) +def sq(a, b): + a[0] = b[0] * b[0] + + +def test_call_kernel_from_cpython(): + """ + Tests if we can call a kernel function from CPython using the call_kernel + dpjit function. + """ + + q = dpctl.SyclQueue() + a = dpnp.ones(100, sycl_queue=q) + b = dpnp.ones_like(a, sycl_queue=q) + c = dpnp.zeros_like(a, sycl_queue=q) + r = Range(100) + ndr = NdRange(global_size=(100,), local_size=(1,)) + + exp_dpex.call_kernel(add, r, a, b, c) + + assert c[0] == b[0] + a[0] + + exp_dpex.call_kernel(add, ndr, a, b, c) + + assert c[0] == b[0] + a[0] + + +def test_call_kernel_from_dpjit(): + """ + Tests if we can call a kernel function from a dpjit function using the + call_kernel dpjit function. + """ + + @dpjit + def range_kernel_caller(q, a, b, c): + r = Range(100) + exp_dpex.call_kernel(add, r, a, b, c) + return c + + @dpjit + def ndrange_kernel_caller(q, a, b, c): + gr = Range(100) + lr = Range(1) + ndr = NdRange(gr, lr) + exp_dpex.call_kernel(add, ndr, a, b, c) + return c + + q = dpctl.SyclQueue() + a = dpnp.ones(100, sycl_queue=q) + b = dpnp.ones_like(a, sycl_queue=q) + c = dpnp.zeros_like(a, sycl_queue=q) + + range_kernel_caller(q, a, b, c) + + assert c[0] == b[0] + a[0] + + ndrange_kernel_caller(q, a, b, c) + + assert c[0] == b[0] + a[0] + + +def test_call_multiple_kernels(): + """ + Tests if the call_kernel dpjit function supports calling different types of + kernel with different number of arguments. 
+ """ + q = dpctl.SyclQueue() + a = dpnp.ones(100, sycl_queue=q) + b = dpnp.ones_like(a, sycl_queue=q) + c = dpnp.zeros_like(a, sycl_queue=q) + r = Range(100) + + exp_dpex.call_kernel(add, r, a, b, c) + + assert c[0] == b[0] + a[0] + + exp_dpex.call_kernel(sq, r, a, c) + + assert a[0] == c[0] * c[0] diff --git a/pyproject.toml b/pyproject.toml index fc81e2609f..7cd918c47b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,11 @@ style = "pep440" versionfile_source = "numba_dpex/_version.py" parentdir_prefix = "" +[tool.pylint] +disable = [ + "fixme" +] + [build-system] requires = [ "setuptools>=42",