Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Host logging #135

Merged
merged 10 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ option(PORTFFT_CLANG_TIDY "Enable clang-tidy checks on portFFT source when build
option(PORTFFT_CLANG_TIDY_AUTOFIX "Attempt to fix defects found by clang-tidy" OFF)
option(PORTFFT_LOG_DUMPS "Whether to enable logging of data dumps" OFF)
option(PORTFFT_LOG_TRANSFERS "Whether to enable logging of memory transfers" OFF)
option(PORTFFT_LOG_TRACE "Whether to enable tracing of function calls" OFF)
option(PORTFFT_LOG_TRACES "Whether to enable tracing of function calls" OFF)
option(PORTFFT_LOG_WARNINGS "Whether to enable logging of warnings" ON)
set(PORTFFT_REGISTERS_PER_WI 128 CACHE STRING "How many 32b registers can be allocated per work item on the target device")
set(PORTFFT_SUBGROUP_SIZES 32 CACHE STRING "Comma separated list of subgroup sizes to compile for. The first size supported by the device will be used.")
set(PORTFFT_VEC_LOAD_BYTES 16 CACHE STRING "Number of consecutive bytes each work item should load at once.")
Expand Down Expand Up @@ -92,21 +93,24 @@ if(${PORTFFT_ENABLE_OOP_BUILDS})
target_compile_definitions(portfft INTERFACE PORTFFT_ENABLE_OOP_BUILDS)
endif()

set(PORTFFT_LOG OFF)
set(PORTFFT_KERNEL_LOG OFF)
if(${PORTFFT_LOG_DUMPS})
target_compile_definitions(portfft INTERFACE PORTFFT_LOG_DUMPS)
set(PORTFFT_LOG ON)
set(PORTFFT_KERNEL_LOG ON)
endif()
if(${PORTFFT_LOG_TRANSFERS})
target_compile_definitions(portfft INTERFACE PORTFFT_LOG_TRANSFERS)
set(PORTFFT_LOG ON)
set(PORTFFT_KERNEL_LOG ON)
endif()
if(${PORTFFT_LOG_TRACE})
target_compile_definitions(portfft INTERFACE PORTFFT_LOG_TRACE)
set(PORTFFT_LOG ON)
if(${PORTFFT_LOG_TRACES})
target_compile_definitions(portfft INTERFACE PORTFFT_LOG_TRACES)
set(PORTFFT_KERNEL_LOG ON)
endif()
if(${PORTFFT_LOG})
target_compile_definitions(portfft INTERFACE PORTFFT_LOG)
if(${PORTFFT_LOG_WARNINGS})
target_compile_definitions(portfft INTERFACE PORTFFT_LOG_WARNINGS)
endif()
if(${PORTFFT_KERNEL_LOG})
target_compile_definitions(portfft INTERFACE PORTFFT_KERNEL_LOG)
endif()

target_compile_options(portfft INTERFACE -fgpu-inline-threshold=1000000)
Expand Down
80 changes: 57 additions & 23 deletions src/portfft/common/global.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,16 +203,23 @@ void launch_kernel(sycl::accessor<const Scalar, 1, sycl::access::mode::read>& in
const Scalar* impl_twiddles, const IdxGlobal* factors, const IdxGlobal* inner_batches,
const IdxGlobal* inclusive_scan, IdxGlobal n_transforms, IdxGlobal input_batch_offset,
std::pair<sycl::range<1>, sycl::range<1>> launch_params, sycl::handler& cgh) {
PORTFFT_LOG_FUNCTION_ENTRY();
auto [global_range, local_range] = launch_params;
#ifdef PORTFFT_LOG
#ifdef PORTFFT_KERNEL_LOG
sycl::stream s{1024 * 16, 1024, cgh};
#endif
PORTFFT_LOG_TRACE("Launching kernel for global implementation with global_size", global_range[0], "local_size",
local_range[0]);
cgh.parallel_for<global_kernel<Scalar, Domain, memory::BUFFER, LayoutIn, LayoutOut, SubgroupSize>>(
sycl::nd_range<1>(global_range, local_range),
[=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) {
sycl::nd_range<1>(global_range, local_range), [=
#ifdef PORTFFT_KERNEL_LOG
,
global_logging_config = detail::global_logging_config
#endif
](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) {
detail::global_data_struct global_data{
#ifdef PORTFFT_LOG
s,
#ifdef PORTFFT_KERNEL_LOG
s, global_logging_config,
#endif
it};
dispatch_level<Scalar, LayoutIn, LayoutOut, SubgroupSize>(
Expand Down Expand Up @@ -254,16 +261,26 @@ void launch_kernel(const Scalar* input, Scalar* output, const Scalar* input_imag
const Scalar* impl_twiddles, const IdxGlobal* factors, const IdxGlobal* inner_batches,
const IdxGlobal* inclusive_scan, IdxGlobal n_transforms, IdxGlobal input_batch_offset,
std::pair<sycl::range<1>, sycl::range<1>> launch_params, sycl::handler& cgh) {
PORTFFT_LOG_FUNCTION_ENTRY();
#ifdef PORTFFT_LOG
sycl::stream s{1024 * 16 * 16, 1024, cgh};
#endif
auto [global_range, local_range] = launch_params;
#ifdef PORTFFT_KERNEL_LOG
sycl::stream s{1024 * 16, 1024, cgh};
#endif
PORTFFT_LOG_TRACE("Launching kernel for global implementation with global_size", global_range[0], "local_size",
local_range[0]);
cgh.parallel_for<global_kernel<Scalar, Domain, memory::USM, LayoutIn, LayoutOut, SubgroupSize>>(
sycl::nd_range<1>(global_range, local_range),
[=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) {
sycl::nd_range<1>(global_range, local_range), [=
#ifdef PORTFFT_KERNEL_LOG
,
global_logging_config = detail::global_logging_config
#endif
](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) {
detail::global_data_struct global_data{
#ifdef PORTFFT_LOG
s,
#ifdef PORTFFT_KERNEL_LOG
s, global_logging_config,
#endif
it};
dispatch_level<Scalar, LayoutIn, LayoutOut, SubgroupSize>(
Expand Down Expand Up @@ -294,17 +311,23 @@ static void dispatch_transpose_kernel_impl(const Scalar* input,
sycl::local_accessor<Scalar, 2>& loc, const IdxGlobal* factors,
const IdxGlobal* inner_batches, const IdxGlobal* inclusive_scan,
IdxGlobal output_offset, IdxGlobal lda, IdxGlobal ldb, sycl::handler& cgh) {
#ifdef PORTFFT_LOG
PORTFFT_LOG_FUNCTION_ENTRY();
#ifdef PORTFFT_KERNEL_LOG
sycl::stream s{1024 * 16, 1024, cgh};
#endif
std::size_t lda_rounded = detail::round_up_to_multiple(static_cast<std::size_t>(lda), static_cast<std::size_t>(16));
std::size_t ldb_rounded = detail::round_up_to_multiple(static_cast<std::size_t>(ldb), static_cast<std::size_t>(16));
PORTFFT_LOG_TRACE("Launching transpose kernel with global_size", lda_rounded, ldb_rounded, "local_size", 16, 16);
cgh.parallel_for<detail::transpose_kernel<Scalar, memory::BUFFER>>(
sycl::nd_range<2>({detail::round_up_to_multiple(static_cast<std::size_t>(lda), static_cast<std::size_t>(16)),
detail::round_up_to_multiple(static_cast<std::size_t>(ldb), static_cast<std::size_t>(16))},
{16, 16}),
[=](sycl::nd_item<2> it, sycl::kernel_handler kh) {
sycl::nd_range<2>({lda_rounded, ldb_rounded}, {16, 16}), [=
#ifdef PORTFFT_KERNEL_LOG
,
global_logging_config = detail::global_logging_config
#endif
](sycl::nd_item<2> it, sycl::kernel_handler kh) {
detail::global_data_struct global_data{
#ifdef PORTFFT_LOG
s,
#ifdef PORTFFT_KERNEL_LOG
s, global_logging_config,
#endif
it};
global_data.log_message_global("entering transpose kernel - buffer impl");
Expand Down Expand Up @@ -347,17 +370,23 @@ static void dispatch_transpose_kernel_impl(const Scalar* input, Scalar* output,
const IdxGlobal* factors, const IdxGlobal* inner_batches,
const IdxGlobal* inclusive_scan, IdxGlobal output_offset, IdxGlobal lda,
IdxGlobal ldb, sycl::handler& cgh) {
#ifdef PORTFFT_LOG
PORTFFT_LOG_FUNCTION_ENTRY();
#ifdef PORTFFT_KERNEL_LOG
sycl::stream s{1024 * 16 * 16, 1024, cgh};
#endif
std::size_t lda_rounded = detail::round_up_to_multiple(static_cast<std::size_t>(lda), static_cast<std::size_t>(16));
std::size_t ldb_rounded = detail::round_up_to_multiple(static_cast<std::size_t>(ldb), static_cast<std::size_t>(16));
PORTFFT_LOG_TRACE("Launching transpose kernel with global_size", lda_rounded, ldb_rounded, "local_size", 16, 16);
cgh.parallel_for<detail::transpose_kernel<Scalar, memory::USM>>(
sycl::nd_range<2>({detail::round_up_to_multiple(static_cast<std::size_t>(lda), static_cast<std::size_t>(16)),
detail::round_up_to_multiple(static_cast<std::size_t>(ldb), static_cast<std::size_t>(16))},
{16, 16}),
[=](sycl::nd_item<2> it, sycl::kernel_handler kh) {
sycl::nd_range<2>({lda_rounded, ldb_rounded}, {16, 16}), [=
#ifdef PORTFFT_KERNEL_LOG
,
global_logging_config = detail::global_logging_config
#endif
](sycl::nd_item<2> it, sycl::kernel_handler kh) {
detail::global_data_struct global_data{
#ifdef PORTFFT_LOG
s,
#ifdef PORTFFT_KERNEL_LOG
s, global_logging_config,
#endif
it};
global_data.log_message_global("entering transpose kernel - USM impl");
Expand Down Expand Up @@ -407,6 +436,7 @@ sycl::event transpose_level(const typename committed_descriptor<Scalar, Domain>:
Idx num_batches_in_l2, IdxGlobal n_transforms, IdxGlobal batch_start, Idx total_factors,
IdxGlobal output_offset, sycl::queue& queue, const std::vector<sycl::event>& events,
complex_storage storage) {
PORTFFT_LOG_FUNCTION_ENTRY();
const IdxGlobal vec_size = storage == complex_storage::INTERLEAVED_COMPLEX ? 2 : 1;
std::vector<sycl::event> transpose_events;
IdxGlobal ld_input = kd_struct.factors.at(1);
Expand Down Expand Up @@ -477,6 +507,7 @@ std::vector<sycl::event> compute_level(
IdxGlobal intermediate_twiddle_offset, IdxGlobal subimpl_twiddle_offset, IdxGlobal input_global_offset,
IdxGlobal committed_size, Idx num_batches_in_l2, IdxGlobal n_transforms, IdxGlobal batch_start, Idx factor_id,
Idx total_factors, complex_storage storage, const std::vector<sycl::event>& dependencies, sycl::queue& queue) {
PORTFFT_LOG_FUNCTION_ENTRY();
IdxGlobal local_range = kd_struct.local_range;
IdxGlobal global_range = kd_struct.global_range;
IdxGlobal batch_size = kd_struct.batch_size;
Expand Down Expand Up @@ -508,6 +539,9 @@ std::vector<sycl::event> compute_level(
const IdxGlobal* inclusive_scan = factors_triple + 2 * total_factors;
const Idx vec_size = storage == complex_storage::INTERLEAVED_COMPLEX ? 2 : 1;
std::vector<sycl::event> events;
PORTFFT_LOG_TRACE("Local mem requirement - input:", local_memory_for_input, "store modifiers",
local_mem_for_store_modifier, "twiddles", loc_mem_for_twiddles, "total",
local_memory_for_input + local_mem_for_store_modifier + loc_mem_for_twiddles);
for (Idx batch_in_l2 = 0; batch_in_l2 < num_batches_in_l2 && batch_in_l2 + batch_start < n_transforms;
batch_in_l2++) {
events.push_back(queue.submit([&](sycl::handler& cgh) {
Expand Down
Loading
Loading