Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

An experimental kernel dispatcher for numba_dpex.kernel decorator #1178

Merged
merged 16 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,19 @@ on:
jobs:
pre-commit:
runs-on: ubuntu-20.04
defaults:
run:
shell: bash -el {0}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
- uses: conda-incubator/setup-miniconda@v2
with:
python-version: '3.11'
- uses: pre-commit/action@v3.0.0
activate-environment: "coverage"
channel-priority: "disabled"
environment-file: environment/pre-commit.yml
- uses: actions/cache@v3
with:
path: ~/.cache/pre-commit
key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
- run: pre-commit run --show-diff-on-failure --color=always --all-files
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,17 @@ repos:
args: ["-i"]
exclude: "numba_dpex/dpnp_iface"
types_or: [c++, c]
- repo: local
hooks:
- id: pylint
name: pylint
entry: pylint
files: ^numba_dpex/experimental
language: system
types: [python]
require_serial: true
args:
[
"-rn", # Only display messages
"-sn", # Don't display the score
]
25 changes: 25 additions & 0 deletions environment/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: dev
channels:
- dppy/label/dev
- numba
- intel
- conda-forge
- nodefaults
dependencies:
- libffi
- gxx_linux-64
- dpcpp_linux-64
- numba==0.58*
- dpctl
- dpnp
- dpcpp-llvm-spirv
- opencl_rt
- coverage
- pytest
- pytest-cov
- pytest-xdist
- pexpect
- scikit-build>=0.15*
- cmake>=3.26*
- pre-commit
- pylint
13 changes: 9 additions & 4 deletions numba_dpex/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,15 @@ def __getattr__(name):
"NUMBA_DPEX_DEBUGINFO", int, config.DEBUGINFO_DEFAULT
)

# Emit LLVM assembly language format(.ll)
DUMP_KERNEL_LLVM = _readenv(
"NUMBA_DPEX_DUMP_KERNEL_LLVM", int, config.DUMP_OPTIMIZED
)
# Emit the LLVM IR generated for a kernel-decorated function
DUMP_KERNEL_LLVM = _readenv("NUMBA_DPEX_DUMP_KERNEL_LLVM", int, 0)

# Emit LLVM module generated to launch a kernel decorated function
DUMP_KERNEL_LAUNCHER = _readenv("NUMBA_DPEX_DUMP_KERNEL_LAUNCHER", int, 0)

# Enables debug printf messages inside the kernel launcher module generated for
# a kernel decorated function
DEBUG_KERNEL_LAUNCHER = _readenv("NUMBA_DPEX_DEBUG_KERNEL_LAUNCHER", int, 0)

# configs for caching
# To see the debug messages for the caching.
Expand Down
2 changes: 2 additions & 0 deletions numba_dpex/core/descriptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ def _inherit_if_not_set(flags, options, name, default=targetconfig._NotSet):
class DpexTargetOptions(CPUTargetOptions):
experimental = _option_mapping("experimental")
release_gil = _option_mapping("release_gil")
no_compile = _option_mapping("no_compile")

def finalize(self, flags, options):
super().finalize(flags, options)
_inherit_if_not_set(flags, options, "experimental", False)
_inherit_if_not_set(flags, options, "release_gil", False)
_inherit_if_not_set(flags, options, "no_compile", True)


class DpexKernelTarget(TargetDescriptor):
Expand Down
6 changes: 6 additions & 0 deletions numba_dpex/core/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ def __init__(self, kernel_name, *, usmarray_argnum_list) -> None:
f"usm_ndarray arguments {usmarray_args} were not allocated "
"on the same queue."
)
else:
self.message = (
f'Execution queue for kernel "{kernel_name}" could not '
"be deduced using compute follows data programming model. The "
"kernel has no USMNdArray argument."
)
super().__init__(self.message)


Expand Down
146 changes: 90 additions & 56 deletions numba_dpex/core/parfors/parfor_lowerer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
# SPDX-License-Identifier: Apache-2.0

import copy
from collections import namedtuple

from llvmlite import ir as llvmir
from numba.core import ir, types
from numba.core import cgutils, ir, types
from numba.parfors.parfor import (
find_potential_aliases_parfor,
get_parfor_outputs,
Expand All @@ -27,6 +28,12 @@
create_reduction_remainder_kernel_for_parfor,
)

_KernelArgs = namedtuple(
"_KernelArgs",
["num_flattened_args", "arg_vals", "arg_types"],
)


# A global list of kernels to keep the objects alive indefinitely.
keep_alive_kernels = []

Expand Down Expand Up @@ -84,21 +91,7 @@ class ParforLowerImpl:
for a parfor and submits it to a queue.
"""

def _get_exec_queue(self, kernel_fn, lowerer):
"""Creates a stack variable storing the sycl queue pointer used to
launch the kernel function.
"""
self.kernel_builder = KernelLaunchIRBuilder(
lowerer.context, lowerer.builder, kernel_fn.kernel.addressof_ref()
)

# Create a local variable storing a pointer to a DPCTLSyclQueueRef
# pointer.
self.curr_queue = self.kernel_builder.get_queue(
exec_queue=kernel_fn.queue
)

def _build_kernel_arglist(self, kernel_fn, lowerer):
def _build_kernel_arglist(self, kernel_fn, lowerer, kernel_builder):
"""Creates local variables for all the arguments and the argument types
that are passed to the kernel function.

Expand All @@ -110,39 +103,43 @@ def _build_kernel_arglist(self, kernel_fn, lowerer):
AssertionError: If the LLVM IR Value for an argument defined in
Numba IR is not found.
"""
self.num_flattened_args = 0
num_flattened_args = 0

# Compute number of args to be passed to the kernel. Note that the
# actual number of kernel arguments is greater than the count of
# kernel_fn.kernel_args as arrays get flattened.
for arg_type in kernel_fn.kernel_arg_types:
if isinstance(arg_type, DpnpNdArray):
datamodel = dpex_dmm.lookup(arg_type)
self.num_flattened_args += datamodel.flattened_field_count
num_flattened_args += datamodel.flattened_field_count
elif arg_type == types.complex64 or arg_type == types.complex128:
self.num_flattened_args += 2
num_flattened_args += 2
else:
self.num_flattened_args += 1
num_flattened_args += 1

# Create LLVM values for the kernel args list and kernel arg types list
self.args_list = self.kernel_builder.allocate_kernel_arg_array(
self.num_flattened_args
)
self.args_ty_list = self.kernel_builder.allocate_kernel_arg_ty_array(
self.num_flattened_args
args_list = kernel_builder.allocate_kernel_arg_array(num_flattened_args)
args_ty_list = kernel_builder.allocate_kernel_arg_ty_array(
num_flattened_args
)
callargs_ptrs = []
for arg in kernel_fn.kernel_args:
callargs_ptrs.append(_getvar(lowerer, arg))

self.kernel_builder.populate_kernel_args_and_args_ty_arrays(
kernel_builder.populate_kernel_args_and_args_ty_arrays(
kernel_argtys=kernel_fn.kernel_arg_types,
callargs_ptrs=callargs_ptrs,
args_list=self.args_list,
args_ty_list=self.args_ty_list,
args_list=args_list,
args_ty_list=args_ty_list,
datamodel_mgr=dpex_dmm,
)

return _KernelArgs(
num_flattened_args=num_flattened_args,
arg_vals=args_list,
arg_types=args_ty_list,
)

def _submit_parfor_kernel(
self,
lowerer,
Expand All @@ -156,9 +153,11 @@ def _submit_parfor_kernel(
# Ensure that the Python arguments are kept alive for the duration of
# the kernel execution
keep_alive_kernels.append(kernel_fn.kernel)
kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue)
args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder)

self._get_exec_queue(kernel_fn, lowerer)
self._build_kernel_arglist(kernel_fn, lowerer)
# Create a global range over which to submit the kernel based on the
# loop_ranges of the parfor
global_range = []
Expand All @@ -178,18 +177,26 @@ def _submit_parfor_kernel(

local_range = []

kernel_ref_addr = kernel_fn.kernel.addressof_ref()
kernel_ref = lowerer.builder.inttoptr(
lowerer.context.get_constant(types.uintp, kernel_ref_addr),
cgutils.voidptr_t,
)
curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref)

# Submit a synchronous kernel
self.kernel_builder.submit_sync_kernel(
self.curr_queue,
self.num_flattened_args,
self.args_list,
self.args_ty_list,
global_range,
local_range,
kernel_builder.submit_sycl_kernel(
sycl_kernel_ref=kernel_ref,
sycl_queue_ref=curr_queue_ref,
total_kernel_args=args.num_flattened_args,
arg_list=args.arg_vals,
arg_ty_list=args.arg_types,
global_range=global_range,
local_range=local_range,
)

# At this point we can free the DPCTLSyclQueueRef (via ptr_to_queue_ref)
self.kernel_builder.free_queue(sycl_queue_val=self.curr_queue)
kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref)

def _submit_reduction_main_parfor_kernel(
self,
Expand All @@ -204,9 +211,11 @@ def _submit_reduction_main_parfor_kernel(
# Ensure that the Python arguments are kept alive for the duration of
# the kernel execution
keep_alive_kernels.append(kernel_fn.kernel)
kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue)

self._get_exec_queue(kernel_fn, lowerer)
self._build_kernel_arglist(kernel_fn, lowerer)
args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder)
# Create a global range over which to submit the kernel based on the
# loop_ranges of the parfor
global_range = []
Expand All @@ -220,16 +229,27 @@ def _submit_reduction_main_parfor_kernel(
_load_range(lowerer, reductionHelper.work_group_size)
)

kernel_ref_addr = kernel_fn.kernel.addressof_ref()
kernel_ref = lowerer.builder.inttoptr(
lowerer.context.get_constant(types.uintp, kernel_ref_addr),
cgutils.voidptr_t,
)
curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref)

# Submit a synchronous kernel
self.kernel_builder.submit_sync_kernel(
self.curr_queue,
self.num_flattened_args,
self.args_list,
self.args_ty_list,
global_range,
local_range,
kernel_builder.submit_sycl_kernel(
sycl_kernel_ref=kernel_ref,
sycl_queue_ref=curr_queue_ref,
total_kernel_args=args.num_flattened_args,
arg_list=args.arg_vals,
arg_ty_list=args.arg_types,
global_range=global_range,
local_range=local_range,
)

# At this point we can free the DPCTLSyclQueueRef (via ptr_to_queue_ref)
kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref)

def _submit_reduction_remainder_parfor_kernel(
self,
lowerer,
Expand All @@ -243,8 +263,11 @@ def _submit_reduction_remainder_parfor_kernel(
# the kernel execution
keep_alive_kernels.append(kernel_fn.kernel)

self._get_exec_queue(kernel_fn, lowerer)
self._build_kernel_arglist(kernel_fn, lowerer)
kernel_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

ptr_to_queue_ref = kernel_builder.get_queue(exec_queue=kernel_fn.queue)

args = self._build_kernel_arglist(kernel_fn, lowerer, kernel_builder)
# Create a global range over which to submit the kernel based on the
# loop_ranges of the parfor
global_range = []
Expand All @@ -255,16 +278,27 @@ def _submit_reduction_remainder_parfor_kernel(

local_range = []

kernel_ref_addr = kernel_fn.kernel.addressof_ref()
kernel_ref = lowerer.builder.inttoptr(
lowerer.context.get_constant(types.uintp, kernel_ref_addr),
cgutils.voidptr_t,
)
curr_queue_ref = lowerer.builder.load(ptr_to_queue_ref)

# Submit a synchronous kernel
self.kernel_builder.submit_sync_kernel(
self.curr_queue,
self.num_flattened_args,
self.args_list,
self.args_ty_list,
global_range,
local_range,
kernel_builder.submit_sycl_kernel(
sycl_kernel_ref=kernel_ref,
sycl_queue_ref=curr_queue_ref,
total_kernel_args=args.num_flattened_args,
arg_list=args.arg_vals,
arg_ty_list=args.arg_types,
global_range=global_range,
local_range=local_range,
)

# At this point we can free the DPCTLSyclQueueRef (via ptr_to_queue_ref)
kernel_builder.free_queue(ptr_to_sycl_queue_ref=ptr_to_queue_ref)

def _reduction_codegen(
self,
parfor,
Expand Down
8 changes: 2 additions & 6 deletions numba_dpex/core/parfors/reduction_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,11 +395,7 @@ def work_group_size(self):

def copy_final_sum_to_host(self, parfor_kernel):
lowerer = self.lowerer
ir_builder = KernelLaunchIRBuilder(
lowerer.context,
lowerer.builder,
parfor_kernel.kernel.addressof_ref(),
)
ir_builder = KernelLaunchIRBuilder(lowerer.context, lowerer.builder)

# Create a local variable storing a pointer to a DPCTLSyclQueueRef
# pointer.
Expand Down Expand Up @@ -447,4 +443,4 @@ def copy_final_sum_to_host(self, parfor_kernel):
sycl.dpctl_event_wait(builder, event_ref)
sycl.dpctl_event_delete(builder, event_ref)

ir_builder.free_queue(sycl_queue_val=curr_queue)
ir_builder.free_queue(ptr_to_sycl_queue_ref=curr_queue)
Loading
Loading