diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 05c20a6614..37018dde6c 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -523,7 +523,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUDA_KERNEL_NODE_PARAMS NodeParams = {}; NodeParams.func = CuFunc; NodeParams.gridDimX = BlocksPerGrid[0]; @@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDimY = ThreadsPerBlock[1]; NodeParams.blockDimZ = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); // Create and add an new kernel node to the Cuda graph UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph, @@ -1398,7 +1398,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDimZ = ThreadsPerBlock[2]; Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize(); Params.kernelParams = - const_cast(KernelCommandHandle->Kernel->getArgIndices().data()); + const_cast(KernelCommandHandle->Kernel->getArgPointers().data()); CUgraphNode Node = KernelCommandHandle->Node; CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 2a4a2cf54f..71c4340456 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -492,7 +492,7 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgIndices = hKernel->getArgPointers(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, @@ -680,7 +680,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); CUlaunchConfig launch_config; launch_config.gridDimX = BlocksPerGrid[0]; @@ -696,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, - const_cast(ArgIndices.data()), + const_cast(ArgPointers.data()), nullptr)); if (phEvent) { diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index a6194e9a57..f299714b02 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -66,8 +66,8 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; /// Position in the Storage array where the next argument should added. size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has @@ -92,21 +92,23 @@ struct ur_kernel_handle_t_ { std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { - // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + // Place the implicit offset index at the end of the ArgPointers + // collection. + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { // Expand storage to accommodate this Index if needed. - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); @@ -117,13 +119,13 @@ struct ur_kernel_handle_t_ { if (ParamSizes[Index] == 0) { ParamSizes[Index] = Size; std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; + ArgPointers[Index] = &Storage[InsertPos]; AlignedLocalMemSize[Index] = LocalSize; InsertPos += Size; } // Otherwise, update the existing argument. else { - std::memcpy(Indices[Index], Arg, Size); + std::memcpy(ArgPointers[Index], Arg, Size); AlignedLocalMemSize[Index] = LocalSize; assert(Size == ParamSizes[Index]); } @@ -138,7 +140,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -168,10 +170,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -187,7 +190,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -235,7 +238,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -306,7 +309,7 @@ struct ur_kernel_handle_t_ { /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API uint32_t getNumArgs() const noexcept { - return static_cast(Args.Indices.size() - 1); + return static_cast(Args.ArgPointers.size() - 1); } void setKernelArg(int Index, size_t Size, const void *Arg) { @@ -321,8 +324,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 09c59bb9f7..887eb75287 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -378,7 +378,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid)); // Set node param structure with the kernel related data - auto &ArgIndices = hKernel->getArgIndices(); + auto &ArgPointers = hKernel->getArgPointers(); hipKernelNodeParams NodeParams; NodeParams.func = HIPFunc; NodeParams.gridDim.x = BlocksPerGrid[0]; @@ -388,7 +388,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( NodeParams.blockDim.y = ThreadsPerBlock[1]; NodeParams.blockDim.z = ThreadsPerBlock[2]; NodeParams.sharedMemBytes = LocalSize; - NodeParams.kernelParams = const_cast(ArgIndices.data()); + NodeParams.kernelParams = const_cast(ArgPointers.data()); NodeParams.extra = nullptr; // Create and add an new kernel node to the HIP graph @@ -1098,7 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDim.z = ThreadsPerBlock[2]; Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); Params.kernelParams = - const_cast(hCommand->Kernel->getArgIndices().data()); + const_cast(hCommand->Kernel->getArgPointers().data()); hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 8c7c1c617d..849369de4b 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -308,7 +308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } - auto ArgIndices = hKernel->getArgIndices(); + auto ArgPointers = hKernel->getArgPointers(); // If migration of mem across buffer is needed, an event must be associated // with this command, implicitly if phEvent is nullptr @@ -322,7 +322,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], - hKernel->getLocalSize(), HIPStream, ArgIndices.data(), nullptr)); + hKernel->getLocalSize(), HIPStream, ArgPointers.data(), nullptr)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index 61dd89cc99..5ec51e7fa4 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -61,8 +61,8 @@ struct ur_kernel_handle_t_ { args_t Storage; /// Aligned size of each parameter, including padding. args_size_t ParamSizes; - /// Byte offset into /p Storage allocation for each parameter. - args_index_t Indices; + /// Byte offset into /p Storage allocation for each argument. + args_index_t ArgPointers; /// Position in the Storage array where the next argument should added. size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has @@ -87,20 +87,21 @@ struct ur_kernel_handle_t_ { arguments() { // Place the implicit offset index at the end of the indicies collection - Indices.emplace_back(&ImplicitOffsetArgs); + ArgPointers.emplace_back(&ImplicitOffsetArgs); } /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. + /// Implicit offset argument is kept at the back of the ArgPointers + /// collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { // Expand storage to accommodate this Index if needed. - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); + ArgPointers.resize(Index + 2, ArgPointers.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); @@ -111,13 +112,13 @@ struct ur_kernel_handle_t_ { if (ParamSizes[Index] == 0) { ParamSizes[Index] = Size; std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; + ArgPointers[Index] = &Storage[InsertPos]; AlignedLocalMemSize[Index] = LocalSize; InsertPos += Size; } // Otherwise, update the existing argument. else { - std::memcpy(Indices[Index], Arg, Size); + std::memcpy(ArgPointers[Index], Arg, Size); AlignedLocalMemSize[Index] = LocalSize; assert(Size == ParamSizes[Index]); } @@ -132,7 +133,7 @@ struct ur_kernel_handle_t_ { std::pair calcAlignedLocalArgument(size_t Index, size_t Size) { // Store the unpadded size of the local argument - if (Index + 2 > Indices.size()) { + if (Index + 2 > ArgPointers.size()) { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } @@ -161,10 +162,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - // Iterate over all existing local argument which follows StartIndex + // Iterate over each existing local argument which follows StartIndex // index, update the offset and pointer into the kernel local memory. void updateLocalArgOffset(size_t StartIndex) { - const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg + const size_t NumArgs = + ArgPointers.size() - 1; // Accounts for implicit arg for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { @@ -180,7 +182,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, + std::memcpy(ArgPointers[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } @@ -219,7 +221,7 @@ struct ur_kernel_handle_t_ { std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - const args_index_t &getIndices() const noexcept { return Indices; } + const args_index_t &getArgPointers() const noexcept { return ArgPointers; } uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), @@ -276,7 +278,7 @@ struct ur_kernel_handle_t_ { /// offset. Note this only returns the current known number of arguments, /// not the real one required by the kernel, since this cannot be queried /// from the HIP Driver API - uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } + uint32_t getNumArgs() const noexcept { return Args.ArgPointers.size() - 1; } void setKernelArg(int Index, size_t Size, const void *Arg) { Args.addArg(Index, Size, Arg); @@ -290,8 +292,8 @@ struct ur_kernel_handle_t_ { return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); + const arguments::args_index_t &getArgPointers() const { + return Args.getArgPointers(); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } diff --git a/test/adapters/cuda/kernel_tests.cpp b/test/adapters/cuda/kernel_tests.cpp index 0f7f3351fe..7b83459c5f 100644 --- a/test/adapters/cuda/kernel_tests.cpp +++ b/test/adapters/cuda/kernel_tests.cpp @@ -153,7 +153,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSimple) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); @@ -175,7 +175,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int number = 10; ASSERT_SUCCESS(urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number)); - const auto &kernelArgs = kernel->getArgIndices(); + const auto &kernelArgs = kernel->getArgPointers(); ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); int storedValue = *static_cast(kernelArgs[0]); ASSERT_EQ(storedValue, number); @@ -183,7 +183,7 @@ TEST_P(cudaKernelTest, URKernelArgumentSetTwice) { int otherNumber = 934; ASSERT_SUCCESS( urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber)); - const auto kernelArgs2 = kernel->getArgIndices(); + const auto kernelArgs2 = kernel->getArgPointers(); ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA); storedValue = *static_cast(kernelArgs2[0]); ASSERT_EQ(storedValue, otherNumber);