diff --git a/CMakeLists.txt b/CMakeLists.txt index fb4af632..27c34084 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,8 @@ option(PORTFFT_CLANG_TIDY "Enable clang-tidy checks on portFFT source when build option(PORTFFT_CLANG_TIDY_AUTOFIX "Attempt to fix defects found by clang-tidy" OFF) option(PORTFFT_LOG_DUMPS "Whether to enable logging of data dumps" OFF) option(PORTFFT_LOG_TRANSFERS "Whether to enable logging of memory transfers" OFF) -option(PORTFFT_LOG_TRACE "Whether to enable tracing of function calls" OFF) +option(PORTFFT_LOG_TRACES "Whether to enable tracing of function calls" OFF) +option(PORTFFT_LOG_WARNINGS "Whether to enable logging of warnings" ON) set(PORTFFT_REGISTERS_PER_WI 128 CACHE STRING "How many 32b registers can be allocated per work item on the target device") set(PORTFFT_SUBGROUP_SIZES 32 CACHE STRING "Comma separated list of subgroup sizes to compile for. The first size supported by the device will be used.") set(PORTFFT_VEC_LOAD_BYTES 16 CACHE STRING "Number of consecutive bytes each work item should load at once.") @@ -92,21 +93,24 @@ if(${PORTFFT_ENABLE_OOP_BUILDS}) target_compile_definitions(portfft INTERFACE PORTFFT_ENABLE_OOP_BUILDS) endif() -set(PORTFFT_LOG OFF) +set(PORTFFT_KERNEL_LOG OFF) if(${PORTFFT_LOG_DUMPS}) target_compile_definitions(portfft INTERFACE PORTFFT_LOG_DUMPS) - set(PORTFFT_LOG ON) + set(PORTFFT_KERNEL_LOG ON) endif() if(${PORTFFT_LOG_TRANSFERS}) target_compile_definitions(portfft INTERFACE PORTFFT_LOG_TRANSFERS) - set(PORTFFT_LOG ON) + set(PORTFFT_KERNEL_LOG ON) endif() -if(${PORTFFT_LOG_TRACE}) - target_compile_definitions(portfft INTERFACE PORTFFT_LOG_TRACE) - set(PORTFFT_LOG ON) +if(${PORTFFT_LOG_TRACES}) + target_compile_definitions(portfft INTERFACE PORTFFT_LOG_TRACES) + set(PORTFFT_KERNEL_LOG ON) endif() -if(${PORTFFT_LOG}) - target_compile_definitions(portfft INTERFACE PORTFFT_LOG) +if(${PORTFFT_LOG_WARNINGS}) + target_compile_definitions(portfft INTERFACE PORTFFT_LOG_WARNINGS) +endif() +if(${PORTFFT_KERNEL_LOG}) + target_compile_definitions(portfft INTERFACE PORTFFT_KERNEL_LOG) endif() target_compile_options(portfft INTERFACE -fgpu-inline-threshold=1000000) diff --git a/src/portfft/common/global.hpp b/src/portfft/common/global.hpp index 17acdf39..bc40e55b 100644 --- a/src/portfft/common/global.hpp +++ b/src/portfft/common/global.hpp @@ -203,16 +203,23 @@ void launch_kernel(sycl::accessor& in const Scalar* impl_twiddles, const IdxGlobal* factors, const IdxGlobal* inner_batches, const IdxGlobal* inclusive_scan, IdxGlobal n_transforms, IdxGlobal input_batch_offset, std::pair, sycl::range<1>> launch_params, sycl::handler& cgh) { + PORTFFT_LOG_FUNCTION_ENTRY(); auto [global_range, local_range] = launch_params; -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG sycl::stream s{1024 * 16, 1024, cgh}; #endif + PORTFFT_LOG_TRACE("Launching kernel for global implementation with global_size", global_range[0], "local_size", + local_range[0]); cgh.parallel_for>( - sycl::nd_range<1>(global_range, local_range), - [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { + sycl::nd_range<1>(global_range, local_range), [= +#ifdef PORTFFT_KERNEL_LOG + , + global_logging_config = detail::global_logging_config +#endif + ](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { detail::global_data_struct global_data{ -#ifdef PORTFFT_LOG - s, +#ifdef PORTFFT_KERNEL_LOG + s, global_logging_config, #endif it}; dispatch_level( @@ -254,16 +261,26 @@ void launch_kernel(const Scalar* input, Scalar* output, const Scalar* input_imag const Scalar* impl_twiddles, const IdxGlobal* factors, const IdxGlobal* inner_batches, const IdxGlobal* inclusive_scan, IdxGlobal n_transforms, IdxGlobal input_batch_offset, std::pair, sycl::range<1>> launch_params, sycl::handler& cgh) { + PORTFFT_LOG_FUNCTION_ENTRY(); #ifdef PORTFFT_LOG sycl::stream s{1024 * 16 * 16, 1024, cgh}; #endif auto [global_range, local_range] = launch_params; +#ifdef PORTFFT_KERNEL_LOG + sycl::stream s{1024 * 16, 1024, cgh}; +#endif + PORTFFT_LOG_TRACE("Launching kernel for global implementation with global_size", global_range[0], "local_size", + local_range[0]); cgh.parallel_for>( - sycl::nd_range<1>(global_range, local_range), - [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { + sycl::nd_range<1>(global_range, local_range), [= +#ifdef PORTFFT_KERNEL_LOG + , + global_logging_config = detail::global_logging_config +#endif + ](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { detail::global_data_struct global_data{ -#ifdef PORTFFT_LOG - s, +#ifdef PORTFFT_KERNEL_LOG + s, global_logging_config, #endif it}; dispatch_level( @@ -294,17 +311,23 @@ static void dispatch_transpose_kernel_impl(const Scalar* input, sycl::local_accessor& loc, const IdxGlobal* factors, const IdxGlobal* inner_batches, const IdxGlobal* inclusive_scan, IdxGlobal output_offset, IdxGlobal lda, IdxGlobal ldb, sycl::handler& cgh) { -#ifdef PORTFFT_LOG + PORTFFT_LOG_FUNCTION_ENTRY(); +#ifdef PORTFFT_KERNEL_LOG sycl::stream s{1024 * 16, 1024, cgh}; #endif + std::size_t lda_rounded = detail::round_up_to_multiple(static_cast(lda), static_cast(16)); + std::size_t ldb_rounded = detail::round_up_to_multiple(static_cast(ldb), static_cast(16)); + PORTFFT_LOG_TRACE("Launching transpose kernel with global_size", lda_rounded, ldb_rounded, "local_size", 16, 16); cgh.parallel_for>( - sycl::nd_range<2>({detail::round_up_to_multiple(static_cast(lda), static_cast(16)), - detail::round_up_to_multiple(static_cast(ldb), static_cast(16))}, - {16, 16}), - [=](sycl::nd_item<2> it, sycl::kernel_handler kh) { + sycl::nd_range<2>({lda_rounded, ldb_rounded}, {16, 16}), [= +#ifdef PORTFFT_KERNEL_LOG + , + global_logging_config = detail::global_logging_config +#endif + ](sycl::nd_item<2> it, sycl::kernel_handler kh) { detail::global_data_struct global_data{ -#ifdef PORTFFT_LOG - s, +#ifdef PORTFFT_KERNEL_LOG + s, global_logging_config, #endif it}; global_data.log_message_global("entering transpose kernel - buffer impl"); @@ -347,17 +370,23 @@ static void dispatch_transpose_kernel_impl(const Scalar* input, Scalar* output, const IdxGlobal* factors, const IdxGlobal* inner_batches, const IdxGlobal* inclusive_scan, IdxGlobal output_offset, IdxGlobal lda, IdxGlobal ldb, sycl::handler& cgh) { -#ifdef PORTFFT_LOG + PORTFFT_LOG_FUNCTION_ENTRY(); +#ifdef PORTFFT_KERNEL_LOG sycl::stream s{1024 * 16 * 16, 1024, cgh}; #endif + std::size_t lda_rounded = detail::round_up_to_multiple(static_cast(lda), static_cast(16)); + std::size_t ldb_rounded = detail::round_up_to_multiple(static_cast(ldb), static_cast(16)); + PORTFFT_LOG_TRACE("Launching transpose kernel with global_size", lda_rounded, ldb_rounded, "local_size", 16, 16); cgh.parallel_for>( - sycl::nd_range<2>({detail::round_up_to_multiple(static_cast(lda), static_cast(16)), - detail::round_up_to_multiple(static_cast(ldb), static_cast(16))}, - {16, 16}), - [=](sycl::nd_item<2> it, sycl::kernel_handler kh) { + sycl::nd_range<2>({lda_rounded, ldb_rounded}, {16, 16}), [= +#ifdef PORTFFT_KERNEL_LOG + , + global_logging_config = detail::global_logging_config +#endif + ](sycl::nd_item<2> it, sycl::kernel_handler kh) { detail::global_data_struct global_data{ -#ifdef PORTFFT_LOG - s, +#ifdef PORTFFT_KERNEL_LOG + s, global_logging_config, #endif it}; global_data.log_message_global("entering transpose kernel - USM impl"); @@ -407,6 +436,7 @@ sycl::event transpose_level(const typename committed_descriptor: Idx num_batches_in_l2, IdxGlobal n_transforms, IdxGlobal batch_start, Idx total_factors, IdxGlobal output_offset, sycl::queue& queue, const std::vector& events, complex_storage storage) { + PORTFFT_LOG_FUNCTION_ENTRY(); const IdxGlobal vec_size = storage == complex_storage::INTERLEAVED_COMPLEX ? 2 : 1; std::vector transpose_events; IdxGlobal ld_input = kd_struct.factors.at(1); @@ -477,6 +507,7 @@ std::vector compute_level( IdxGlobal intermediate_twiddle_offset, IdxGlobal subimpl_twiddle_offset, IdxGlobal input_global_offset, IdxGlobal committed_size, Idx num_batches_in_l2, IdxGlobal n_transforms, IdxGlobal batch_start, Idx factor_id, Idx total_factors, complex_storage storage, const std::vector& dependencies, sycl::queue& queue) { + PORTFFT_LOG_FUNCTION_ENTRY(); IdxGlobal local_range = kd_struct.local_range; IdxGlobal global_range = kd_struct.global_range; IdxGlobal batch_size = kd_struct.batch_size; @@ -508,6 +539,9 @@ std::vector compute_level( const IdxGlobal* inclusive_scan = factors_triple + 2 * total_factors; const Idx vec_size = storage == complex_storage::INTERLEAVED_COMPLEX ? 2 : 1; std::vector events; + PORTFFT_LOG_TRACE("Local mem requirement - input:", local_memory_for_input, "store modifiers", + local_mem_for_store_modifier, "twiddles", loc_mem_for_twiddles, "total", + local_memory_for_input + local_mem_for_store_modifier + loc_mem_for_twiddles); for (Idx batch_in_l2 = 0; batch_in_l2 < num_batches_in_l2 && batch_in_l2 + batch_start < n_transforms; batch_in_l2++) { events.push_back(queue.submit([&](sycl::handler& cgh) { diff --git a/src/portfft/common/logging.hpp b/src/portfft/common/logging.hpp index efef47ce..6a12ab8d 100644 --- a/src/portfft/common/logging.hpp +++ b/src/portfft/common/logging.hpp @@ -28,13 +28,61 @@ namespace portfft::detail { +struct logging_config { + bool log_transfers = true; + bool log_dumps = true; + bool log_trace = true; + bool log_warnings = true; + logging_config() { + char* log_transfers_str = getenv("PORTFFT_LOG_TRANSFERS"); + if (log_transfers_str != nullptr) { + log_transfers = static_cast(atoi(log_transfers_str)); +#ifndef PORTFFT_LOG_TRANSFERS + if (log_transfers) { + std::cerr << "Can not enable logging of transfers if it is disabled at compile time." << std::endl; + } +#endif + } + char* log_dumps_str = getenv("PORTFFT_LOG_DUMPS"); + if (log_dumps_str != nullptr) { + log_dumps = static_cast(atoi(log_dumps_str)); +#ifndef PORTFFT_LOG_DUMPS + if (log_dumps) { + std::cerr << "Can not enable logging of dumps if it is disabled at compile time." << std::endl; + } +#endif + } + char* log_trace_str = getenv("PORTFFT_LOG_TRACE"); + if (log_trace_str != nullptr) { + log_trace = static_cast(atoi(log_trace_str)); +#ifndef PORTFFT_LOG_TRACE + if (log_trace) { + std::cerr << "Can not enable logging of traces if it is disabled at compile time." << std::endl; + } +#endif + } + char* log_warnings_str = getenv("PORTFFT_LOG_WARNINGS"); + if (log_warnings_str != nullptr) { + log_warnings = static_cast(atoi(log_warnings_str)); +#ifndef PORTFFT_LOG_WARNINGS + if (log_warnings) { + std::cerr << "Can not enable logging of warnings if it is disabled at compile time." << std::endl; + } +#endif + } + } +}; + +const logging_config global_logging_config; + /** * Struct containing objects that are used in almost all functions. */ template struct global_data_struct { -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG sycl::stream s; + logging_config global_logging_config; #endif sycl::nd_item it; sycl::sub_group sg; @@ -46,13 +94,14 @@ struct global_data_struct { * @param it nd_item of the kernel */ global_data_struct( -#ifdef PORTFFT_LOG - sycl::stream s, +#ifdef PORTFFT_KERNEL_LOG + sycl::stream s, logging_config global_logging_config, #endif sycl::nd_item it) : -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG s(s << sycl::setprecision(3)), + global_logging_config(global_logging_config), #endif it(it), sg(it.get_sub_group()) { @@ -71,7 +120,7 @@ struct global_data_struct { } } -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG /** * Logs ids of workitem, subgroup and workgroup. */ @@ -93,8 +142,7 @@ struct global_data_struct { } /** - * Implementation of log_message. End of recursion - logs the messages separated by newlines, adds a newline and - * flushes the stream. + * Implementation of log_message. * * @tparam TFirst type of the first object to log * @tparam Ts types of the other objects to log @@ -122,7 +170,7 @@ struct global_data_struct { PORTFFT_INLINE void log_dump_local([[maybe_unused]] const char* message, [[maybe_unused]] ViewT data, [[maybe_unused]] Idx num) { #ifdef PORTFFT_LOG_DUMPS - if (it.get_local_id(0) == 0) { + if (global_logging_config.log_dumps && it.get_local_id(0) == 0) { s << "wg_id " << it.get_group(0); s << " " << message << " "; if (num) { @@ -150,15 +198,17 @@ struct global_data_struct { PORTFFT_INLINE void log_dump_private([[maybe_unused]] const char* message, [[maybe_unused]] T* ptr, [[maybe_unused]] Idx num) { #ifdef PORTFFT_LOG_DUMPS - log_ids(); - s << message << " "; - if (num) { - s << ptr[0]; - } - for (Idx i = 1; i < num; i++) { - s << ", " << ptr[i]; + if (global_logging_config.log_dumps) { + log_ids(); + s << message << " "; + if (num) { + s << ptr[0]; + } + for (Idx i = 1; i < num; i++) { + s << ", " << ptr[i]; + } + s << "\n" << sycl::stream_manipulator::flush; } - s << "\n" << sycl::stream_manipulator::flush; #endif } @@ -173,8 +223,10 @@ struct global_data_struct { template PORTFFT_INLINE void log_message([[maybe_unused]] Ts... messages) { #ifdef PORTFFT_LOG_TRANSFERS - log_ids(); - log_message_impl(messages...); + if (global_logging_config.log_transfers) { + log_ids(); + log_message_impl(messages...); + } #endif } @@ -191,7 +243,7 @@ struct global_data_struct { template PORTFFT_INLINE void log_message_subgroup([[maybe_unused]] Ts... messages) { #ifdef PORTFFT_LOG_TRANSFERS - if (sg.leader()) { + if (global_logging_config.log_transfers && sg.leader()) { s << "sg_id " << sg.get_group_linear_id() << " " << "wg_id " << it.get_group(0) << " "; log_message_impl(messages...); @@ -211,7 +263,7 @@ struct global_data_struct { template PORTFFT_INLINE void log_message_local([[maybe_unused]] Ts... messages) { #ifdef PORTFFT_LOG_TRANSFERS - if (it.get_local_id(0) == 0) { + if (global_logging_config.log_transfers && it.get_local_id(0) == 0) { s << "wg_id " << it.get_group(0) << " "; log_message_impl(messages...); } @@ -230,7 +282,7 @@ struct global_data_struct { template PORTFFT_INLINE void log_message_global([[maybe_unused]] Ts... messages) { #ifdef PORTFFT_LOG_TRACE - if (it.get_global_id(0) == 0) { + if (global_logging_config.log_trace && it.get_global_id(0) == 0) { log_message_impl(messages...); } #endif @@ -261,6 +313,48 @@ struct global_data_struct { } }; +/* + * Outputs an object to std::cout. + * + * @tparam T type of the object to output + * @param object object to output + */ +template +void output(const T& object) { + if constexpr (std::is_enum_v) { + output(static_cast>(object)); + } else { + std::cout << object; + } +} + +/* + * Outputs an object to std::cout. A `std::vector` is output by elements. + * + * @tparam T type of the object to output + * @param object object to output + */ +template +void output(const std::vector& object) { + std::cout << "("; + for (const T& element : object) { + output(element); + std::cout << ", "; + } + std::cout << ")"; +} + +/** + * Logs a message. + * + * @tparam Ts types of the objects to log + * @param messages messages to log + */ +template +void log_message_impl(Ts... messages) { + ((output(messages), std::cout << " "), ...); + std::cout << std::endl; +} /** * Prints the message and dumps data from host to standard output * @@ -270,14 +364,15 @@ struct global_data_struct { * @param size number of elements to dump */ template -PORTFFT_INLINE void dump_host([[maybe_unused]] const char* msg, [[maybe_unused]] T* host_ptr, - [[maybe_unused]] std::size_t size) { +void dump_host([[maybe_unused]] const char* msg, [[maybe_unused]] T* host_ptr, [[maybe_unused]] std::size_t size) { #ifdef PORTFFT_LOG_DUMPS - std::cout << msg << " "; - for (std::size_t i = 0; i < size; i++) { - std::cout << host_ptr[i] << ", "; + if (global_logging_config.log_dumps) { + std::cout << msg << " "; + for (std::size_t i = 0; i < size; i++) { + std::cout << host_ptr[i] << ", "; + } + std::cout << std::endl; } - std::cout << std::endl; #endif } @@ -292,16 +387,60 @@ PORTFFT_INLINE void dump_host([[maybe_unused]] const char* msg, [[maybe_unused]] * @param dependencies dependencies to wait on */ template -PORTFFT_INLINE void dump_device([[maybe_unused]] sycl::queue& q, [[maybe_unused]] const char* msg, - [[maybe_unused]] T* dev_ptr, [[maybe_unused]] std::size_t size, - [[maybe_unused]] const std::vector& dependencies = {}) { +void dump_device([[maybe_unused]] sycl::queue& q, [[maybe_unused]] const char* msg, [[maybe_unused]] T* dev_ptr, + [[maybe_unused]] std::size_t size, + [[maybe_unused]] const std::vector& dependencies = {}) { #ifdef PORTFFT_LOG_DUMPS - std::vector tmp(size); - q.copy(dev_ptr, tmp.data(), size, dependencies).wait(); - dump_host(msg, tmp.data(), size); + if (global_logging_config.log_dumps) { + std::vector tmp(size); + q.copy(dev_ptr, tmp.data(), size, dependencies).wait(); + dump_host(msg, tmp.data(), size); + } #endif } +/** + * Logs a trace. Can log multiple objects/strings. They will be separated by spaces. + * + * Does nothing if logging of traces is not enabled (PORTFFT_LOG_TRACE is not defined). + * + * @tparam Ts types of the objects to log + * @param messages objects to log + */ +template +void log_trace([[maybe_unused]] const Ts&... messages) { +#ifdef PORTFFT_LOG_TRACES + if (global_logging_config.log_trace) { + log_message_impl(messages...); + } +#endif +} + +/** + * Logs a warning. Can log multiple objects/strings. They will be separated by spaces. + * + * Does nothing if logging of warnings is not enabled (PORTFFT_LOG_WARNING is not defined). + * + * @tparam Ts types of the objects to log + * @param messages objects to log + */ +template +void log_warning([[maybe_unused]] const Ts&... messages) { +#ifdef PORTFFT_LOG_WARNINGS + if (global_logging_config.log_warnings) { + log_message_impl("WARNING:", messages...); + } +#endif +} + +#define PORTFFT_LOGGING_LOCATION_INFORMATION __FILE__ ", line", __LINE__, "- in", __FUNCTION__, ":" + +#define PORTFFT_LOG_FUNCTION_ENTRY() portfft::detail::log_trace(PORTFFT_LOGGING_LOCATION_INFORMATION, "entered") + +#define PORTFFT_LOG_TRACE(...) portfft::detail::log_trace(PORTFFT_LOGGING_LOCATION_INFORMATION, __VA_ARGS__) + +#define PORTFFT_LOG_WARNING(...) portfft::detail::log_warning(PORTFFT_LOGGING_LOCATION_INFORMATION, __VA_ARGS__) + }; // namespace portfft::detail #endif diff --git a/src/portfft/defines.hpp b/src/portfft/defines.hpp index 756bffdf..9fcd41fd 100644 --- a/src/portfft/defines.hpp +++ b/src/portfft/defines.hpp @@ -23,7 +23,8 @@ #include -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG +// to avoid extremely long compile times - logging from kernel kills performance anyway #define PORTFFT_INLINE __attribute__((noinline)) #else #define PORTFFT_INLINE __attribute__((always_inline)) diff --git a/src/portfft/descriptor.hpp b/src/portfft/descriptor.hpp index 6a4aec56..ab41cc33 100644 --- a/src/portfft/descriptor.hpp +++ b/src/portfft/descriptor.hpp @@ -75,6 +75,7 @@ class transpose_kernel; * @param lengths the dimensions of the dft */ inline std::vector get_default_strides(const std::vector& lengths) { + PORTFFT_LOG_FUNCTION_ENTRY(); std::vector strides(lengths.size()); std::size_t total_size = 1; for (std::size_t i_plus1 = lengths.size(); i_plus1 > 0; i_plus1--) { @@ -82,6 +83,7 @@ inline std::vector get_default_strides(const std::vector std::tuple prepare_implementation(std::size_t kernel_num) { + PORTFFT_LOG_FUNCTION_ENTRY(); // TODO: check and support all the parameter values if constexpr (Domain != domain::COMPLEX) { throw unsupported_configuration("portFFT only supports complex to complex transforms"); @@ -332,6 +335,7 @@ class committed_descriptor { IdxGlobal fft_size = static_cast(params.lengths[kernel_num]); if (detail::fits_in_wi(fft_size)) { ids = detail::get_ids(); + PORTFFT_LOG_TRACE("Prepared workitem impl for size: ", fft_size); return {detail::level::WORKITEM, {{detail::level::WORKITEM, ids, factors}}}; } if (detail::fits_in_sg(fft_size, SubgroupSize)) { @@ -342,6 +346,7 @@ class committed_descriptor { factors.push_back(factor_wi); factors.push_back(factor_sg); ids = detail::get_ids(); + PORTFFT_LOG_TRACE("Prepared subgroup impl with factor_wi:", factor_wi, "and factor_sg:", factor_sg); return {detail::level::SUBGROUP, {{detail::level::SUBGROUP, ids, factors}}}; } IdxGlobal n_idx_global = detail::factorize(fft_size); @@ -372,9 +377,12 @@ class committed_descriptor { // This factorization of N and M is duplicated in the dispatch logic on the device. // The CT and spec constant factors should match. ids = detail::get_ids(); + PORTFFT_LOG_TRACE("Prepared workgroup impl with factor_wi_n:", factor_wi_n, " factor_sg_n:", factor_sg_n, + " factor_wi_m:", factor_wi_m, " factor_sg_m:", factor_sg_m); return {detail::level::WORKGROUP, {{detail::level::WORKGROUP, ids, factors}}}; } } + PORTFFT_LOG_TRACE("Preparing global impl"); std::vector, std::vector>> param_vec; auto check_and_select_target_level = [&](IdxGlobal factor_size, bool batch_interleaved_layout = true) -> bool { if (detail::fits_in_wi(factor_size)) { @@ -382,7 +390,7 @@ class committed_descriptor { param_vec.emplace_back(detail::level::WORKITEM, detail::get_ids(), std::vector{static_cast(factor_size)}); - + PORTFFT_LOG_TRACE("Workitem kernel for factor:", factor_size); return true; } bool fits_in_local_memory_subgroup = [&]() { @@ -410,11 +418,13 @@ class committed_descriptor { }(); if (detail::fits_in_sg(factor_size, SubgroupSize) && fits_in_local_memory_subgroup && !PORTFFT_SLOW_SG_SHUFFLES) { + Idx factor_sg = detail::factorize_sg(static_cast(factor_size), SubgroupSize); + Idx factor_wi = static_cast(factor_size) / factor_sg; + PORTFFT_LOG_TRACE("Subgroup kernel for factor:", factor_size, "with factor_wi:", factor_wi, + "and factor_sg:", factor_sg); param_vec.emplace_back(detail::level::SUBGROUP, detail::get_ids(), - std::vector{detail::factorize_sg(static_cast(factor_size), SubgroupSize), - static_cast(factor_size) / - detail::factorize_sg(static_cast(factor_size), SubgroupSize)}); + std::vector{factor_sg, factor_wi}); return true; } return false; @@ -458,16 +468,27 @@ class committed_descriptor { detail::apply_scale_factor scale_factor_applied, detail::level level, detail::complex_conjugate conjugate_on_load, detail::complex_conjugate conjugate_on_store, Scalar scale_factor, Idx factor_num = 0, Idx num_factors = 0) { + PORTFFT_LOG_FUNCTION_ENTRY(); const Idx length_idx = static_cast(length); // These spec constants are used in all implementations, so we set them here + PORTFFT_LOG_TRACE("Setting specialization constants:"); + PORTFFT_LOG_TRACE("SpecConstComplexStorage:", params.complex_storage); in_bundle.template set_specialization_constant(params.complex_storage); + PORTFFT_LOG_TRACE("SpecConstNumRealsPerFFT:", 2 * length_idx); in_bundle.template set_specialization_constant(2 * length_idx); + PORTFFT_LOG_TRACE("SpecConstWIScratchSize:", 2 * detail::wi_temps(length_idx)); in_bundle.template set_specialization_constant(2 * detail::wi_temps(length_idx)); + PORTFFT_LOG_TRACE("SpecConstMultiplyOnLoad:", multiply_on_load); in_bundle.template set_specialization_constant(multiply_on_load); + PORTFFT_LOG_TRACE("SpecConstMultiplyOnStore:", multiply_on_store); in_bundle.template set_specialization_constant(multiply_on_store); + PORTFFT_LOG_TRACE("SpecConstApplyScaleFactor:", scale_factor_applied); in_bundle.template set_specialization_constant(scale_factor_applied); + PORTFFT_LOG_TRACE("SpecConstConjugateOnLoad:", conjugate_on_load); in_bundle.template set_specialization_constant(conjugate_on_load); + PORTFFT_LOG_TRACE("SpecConstConjugateOnStore:", conjugate_on_store); in_bundle.template set_specialization_constant(conjugate_on_store); + PORTFFT_LOG_TRACE("get_spec_constant_scale:", scale_factor); in_bundle.template set_specialization_constant()>(scale_factor); dispatch(top_level, in_bundle, length, factors, level, factor_num, num_factors); @@ -499,6 +520,7 @@ class committed_descriptor { template std::size_t num_scalars_in_local_mem(detail::level level, std::size_t length, Idx used_sg_size, const std::vector& factors, Idx& num_sgs_per_wg) { + PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch(level, length, used_sg_size, factors, num_sgs_per_wg); } @@ -523,6 +545,7 @@ class committed_descriptor { */ Scalar* calculate_twiddles(detail::level level, dimension_struct& dimension_data, std::vector& kernels) { + PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch(level, dimension_data, kernels); } @@ -590,10 +613,12 @@ class committed_descriptor { scale_factor_applied, level, conjugate_on_load, conjugate_on_store, scale_factor); } try { + PORTFFT_LOG_TRACE("Building kernel bundle with subgroup size", SubgroupSize); result.emplace_back(sycl::build(in_bundle), factors, params.lengths[dimension_num], SubgroupSize, PORTFFT_SGS_IN_WG, std::shared_ptr(), level); + PORTFFT_LOG_TRACE("Kernel bundle build complete."); } catch (std::exception& e) { - std::cerr << "Build for subgroup size " << SubgroupSize << " failed with message:\n" << e.what() << std::endl; + PORTFFT_LOG_WARNING("Build for subgroup size", SubgroupSize, "failed with message:\n", e.what()); is_compatible = false; break; } @@ -617,6 +642,7 @@ class committed_descriptor { */ template dimension_struct build_w_spec_const(std::size_t dimension_num, bool skip_scaling) { + PORTFFT_LOG_FUNCTION_ENTRY(); if (std::count(supported_sg_sizes.begin(), supported_sg_sizes.end(), SubgroupSize)) { auto [top_level, prepared_vec] = prepare_implementation(dimension_num); bool is_compatible = true; @@ -650,6 +676,7 @@ class committed_descriptor { * @param num_global_level_dimensions number of global level dimensions in the committed size */ void allocate_scratch_and_precompute_scan(Idx num_global_level_dimensions) { + PORTFFT_LOG_FUNCTION_ENTRY(); std::size_t n_kernels = params.lengths.size(); if (num_global_level_dimensions == 1) { std::size_t global_dimension = 0; @@ -673,7 +700,7 @@ class committed_descriptor { } dimensions.at(global_dimension).num_factors = static_cast(factors.size()); std::size_t cache_space_left_for_batches = static_cast(llc_size) - cache_required_for_twiddles; - // TODO: In case of mutli-dim (single dim global sized), this should be batches corresposding to that dim + // TODO: In case of multi-dim (single dim global sized), this should be batches corresponding to that dim dimensions.at(global_dimension).num_batches_in_l2 = static_cast(std::min( static_cast(PORTFFT_MAX_CONCURRENT_KERNELS), std::min(params.number_of_transforms, @@ -681,18 +708,16 @@ class committed_descriptor { (2 * dimensions.at(global_dimension).length * sizeof(Scalar)))))); scratch_space_required = 2 * dimensions.at(global_dimension).length * static_cast(dimensions.at(global_dimension).num_batches_in_l2); - scratch_ptr_1 = - detail::make_shared(2 * dimensions.at(global_dimension).length * - static_cast(dimensions.at(global_dimension).num_batches_in_l2), - queue); - scratch_ptr_2 = - detail::make_shared(2 * dimensions.at(global_dimension).length * - static_cast(dimensions.at(global_dimension).num_batches_in_l2), - queue); + PORTFFT_LOG_TRACE("Allocating 2 scratch arrays of size", scratch_space_required, "scalars in global memory"); + scratch_ptr_1 = detail::make_shared(scratch_space_required, queue); + scratch_ptr_2 = detail::make_shared(scratch_space_required, queue); inclusive_scan.push_back(factors.at(0)); for (std::size_t i = 1; i < factors.size(); i++) { inclusive_scan.push_back(inclusive_scan.at(i - 1) * factors.at(i)); } + PORTFFT_LOG_TRACE("Dimension:", global_dimension, + "num_batches_in_l2:", dimensions.at(global_dimension).num_batches_in_l2, + "scan:", inclusive_scan); dimensions.at(global_dimension).factors_and_scan = detail::make_shared(factors.size() + sub_batches.size() + inclusive_scan.size(), queue); queue.copy(factors.data(), dimensions.at(global_dimension).factors_and_scan.get(), factors.size()); @@ -708,8 +733,12 @@ class committed_descriptor { std::vector ids; auto in_bundle = sycl::get_kernel_bundle(queue.get_context(), detail::get_transpose_kernel_ids()); + PORTFFT_LOG_TRACE("Setting specialization constants for transpose kernel", i); + PORTFFT_LOG_TRACE("SpecConstComplexStorage:", params.complex_storage); in_bundle.template set_specialization_constant(params.complex_storage); + PORTFFT_LOG_TRACE("GlobalSpecConstLevelNum:", i); in_bundle.template set_specialization_constant(static_cast(i)); + PORTFFT_LOG_TRACE("GlobalSpecConstNumFactors:", factors.size()); in_bundle.template set_specialization_constant( static_cast(factors.size())); dimensions.at(global_dimension) @@ -761,7 +790,10 @@ class committed_descriptor { for (std::size_t j = 0; j < num_transposes_required; j++) { auto in_bundle = sycl::get_kernel_bundle( queue.get_context(), detail::get_transpose_kernel_ids()); + PORTFFT_LOG_TRACE("Setting specilization constants for transpose kernel", j); + PORTFFT_LOG_TRACE("GlobalSpecConstLevelNum:", i); in_bundle.template set_specialization_constant(static_cast(i)); + PORTFFT_LOG_TRACE("GlobalSpecConstNumFactors:", factors.size()); in_bundle.template set_specialization_constant( static_cast(factors.size())); dimensions.at(i).transpose_kernels.emplace_back( @@ -790,8 +822,14 @@ class committed_descriptor { supported_sg_sizes(dev.get_info()), local_memory_size(static_cast(queue.get_device().get_info())), llc_size(static_cast(queue.get_device().get_info())) { - // check it's suitable to run + PORTFFT_LOG_FUNCTION_ENTRY(); + PORTFFT_LOG_TRACE("Device info:"); + PORTFFT_LOG_TRACE("n_compute_units:", n_compute_units); + PORTFFT_LOG_TRACE("supported_sg_sizes:", supported_sg_sizes); + PORTFFT_LOG_TRACE("local_memory_size:", local_memory_size); + PORTFFT_LOG_TRACE("llc_size:", llc_size); + // check it's suitable to run const auto forward_layout = detail::get_layout(params, direction::FORWARD); const auto backward_layout = detail::get_layout(params, direction::BACKWARD); if (params.lengths.size() > 1) { @@ -829,6 +867,7 @@ class committed_descriptor { calculate_twiddles(dimensions.back().level, dimensions.at(i), dimensions.back().backward_kernels), [queue](Scalar* ptr) { if (ptr != nullptr) { + PORTFFT_LOG_TRACE("Freeing the array for twiddle factors"); sycl::free(ptr, queue); } }); @@ -862,6 +901,7 @@ class committed_descriptor { * @param desc committed_descriptor of which the copy is to be made */ void create_copy(const committed_descriptor& desc) { + PORTFFT_LOG_FUNCTION_ENTRY(); #define PORTFFT_COPY(x) this->x = desc.x; PORTFFT_COPY(params) PORTFFT_COPY(queue) @@ -883,6 +923,7 @@ class committed_descriptor { } } if (is_scratch_required) { + PORTFFT_LOG_TRACE("Allocating 2 scratch arrays of size", desc.scratch_space_required, "Scalars in global memory"); this->scratch_ptr_1 = detail::make_shared(static_cast(desc.scratch_space_required), this->queue); this->scratch_ptr_2 = @@ -891,13 +932,19 @@ class committed_descriptor { } public: - committed_descriptor(const committed_descriptor& desc) : params(desc.params) { create_copy(desc); } + committed_descriptor(const committed_descriptor& desc) : params(desc.params) { + PORTFFT_LOG_FUNCTION_ENTRY(); + create_copy(desc); + } + committed_descriptor& operator=(const committed_descriptor& desc) { + PORTFFT_LOG_FUNCTION_ENTRY(); if (this != &desc) { create_copy(desc); } return *this; } + static_assert(std::is_same_v || std::is_same_v, "Scalar must be either float or double!"); /** @@ -912,7 +959,10 @@ class committed_descriptor { /** * Destructor */ - ~committed_descriptor() { queue.wait(); } + ~committed_descriptor() { + PORTFFT_LOG_FUNCTION_ENTRY(); + queue.wait(); + } // default construction is not appropriate committed_descriptor() = delete; @@ -923,6 +973,7 @@ class committed_descriptor { * @param inout buffer containing input and output data */ void compute_forward(sycl::buffer& inout) { + PORTFFT_LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_forward(inout, inout); @@ -935,6 +986,7 @@ class committed_descriptor { * @param inout_imag buffer containing imaginary part of the input and output data */ void compute_forward(sycl::buffer& inout_real, sycl::buffer& inout_imag) { + PORTFFT_LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_forward(inout_real, inout_imag, inout_real, inout_imag); @@ -946,6 +998,7 @@ class committed_descriptor { * @param inout buffer containing input and output data */ void compute_backward(sycl::buffer& inout) { + PORTFFT_LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_backward(inout, inout); @@ -958,6 +1011,7 @@ class committed_descriptor { * @param inout_imag buffer containing imaginary part of the input and output data */ void compute_backward(sycl::buffer& inout_real, sycl::buffer& inout_imag) { + PORTFFT_LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_backward(inout_real, inout_imag, inout_real, inout_imag); @@ -970,6 +1024,7 @@ class committed_descriptor { * @param out buffer containing output data */ void compute_forward(const sycl::buffer& in, sycl::buffer& out) { + PORTFFT_LOG_FUNCTION_ENTRY(); dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX, direction::FORWARD); } @@ -983,6 +1038,7 @@ class committed_descriptor { */ void compute_forward(const sycl::buffer& in_real, const sycl::buffer& in_imag, sycl::buffer& out_real, sycl::buffer& out_imag) { + PORTFFT_LOG_FUNCTION_ENTRY(); dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX, direction::FORWARD); } @@ -993,6 +1049,7 @@ class committed_descriptor { * @param out buffer containing output data */ void compute_forward(const sycl::buffer& /*in*/, sycl::buffer& /*out*/) { + PORTFFT_LOG_FUNCTION_ENTRY(); throw unsupported_configuration("Real to complex FFTs not yet implemented."); } @@ -1003,6 +1060,7 @@ class committed_descriptor { * @param out buffer containing output data */ void compute_backward(const sycl::buffer& in, sycl::buffer& out) { + PORTFFT_LOG_FUNCTION_ENTRY(); dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX, direction::BACKWARD); } @@ -1016,6 +1074,7 @@ class committed_descriptor { */ void compute_backward(const sycl::buffer& in_real, const sycl::buffer& in_imag, sycl::buffer& out_real, sycl::buffer& out_imag) { + PORTFFT_LOG_FUNCTION_ENTRY(); dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX, direction::BACKWARD); } @@ -1027,6 +1086,7 @@ class committed_descriptor { * @return sycl::event associated with this computation */ sycl::event compute_forward(complex_type* inout, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. return compute_forward(inout, inout, dependencies); @@ -1042,6 +1102,7 @@ class committed_descriptor { */ sycl::event compute_forward(scalar_type* inout_real, scalar_type* inout_imag, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. return compute_forward(inout_real, inout_imag, inout_real, inout_imag, dependencies); @@ -1055,6 +1116,7 @@ class committed_descriptor { * @return sycl::event associated with this computation */ sycl::event compute_forward(Scalar* inout, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. return compute_forward(inout, reinterpret_cast(inout), dependencies); @@ -1068,6 +1130,7 @@ class committed_descriptor { * @return sycl::event associated with this computation */ sycl::event compute_backward(complex_type* inout, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); return compute_backward(inout, inout, dependencies); } @@ -1081,6 +1144,7 @@ class committed_descriptor { */ sycl::event compute_backward(scalar_type* inout_real, scalar_type* inout_imag, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); return compute_backward(inout_real, inout_imag, inout_real, inout_imag, dependencies); } @@ -1094,6 +1158,7 @@ class committed_descriptor { */ sycl::event compute_forward(const complex_type* in, complex_type* out, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX, direction::FORWARD, dependencies); } @@ -1109,6 +1174,7 @@ class committed_descriptor { */ sycl::event compute_forward(const scalar_type* in_real, const scalar_type* in_imag, scalar_type* out_real, scalar_type* out_imag, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX, direction::FORWARD, dependencies); } @@ -1123,6 +1189,7 @@ class committed_descriptor { */ sycl::event compute_forward(const Scalar* /*in*/, complex_type* /*out*/, const std::vector& /*dependencies*/ = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); throw unsupported_configuration("Real to complex FFTs not yet implemented."); return {}; } @@ -1137,6 +1204,7 @@ class committed_descriptor { */ sycl::event compute_backward(const complex_type* in, complex_type* out, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX, direction::BACKWARD, dependencies); } @@ -1153,6 +1221,7 @@ class committed_descriptor { */ sycl::event compute_backward(const scalar_type* in_real, const scalar_type* in_imag, scalar_type* out_real, scalar_type* out_imag, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX, direction::BACKWARD, dependencies); } @@ -1180,6 +1249,7 @@ class committed_descriptor { sycl::event dispatch_direction(const TIn& in, TOut& out, const TIn& in_imag, TOut& out_imag, complex_storage used_storage, direction compute_direction, const std::vector& dependencies = {}) { + PORTFFT_LOG_FUNCTION_ENTRY(); #ifndef PORTFFT_ENABLE_BUFFER_BUILDS if constexpr (!std::is_pointer_v || !std::is_pointer_v) { throw invalid_configuration("Buffer interface can not be called when buffer builds are disabled."); @@ -1235,6 +1305,7 @@ class committed_descriptor { const std::vector& output_strides, std::size_t input_distance, std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); using TOutConst = std::conditional_t, const std::remove_pointer_t*, const TOut>; std::size_t n_dimensions = params.lengths.size(); std::size_t total_size = params.get_flattened_length(); @@ -1264,6 +1335,7 @@ class committed_descriptor { output_distance = params.lengths.back(); } + PORTFFT_LOG_TRACE("Dispatching the kernel for the last dimension"); sycl::event previous_event = dispatch_kernel_1d(in, out, in_imag, out_imag, dependencies, params.number_of_transforms * outer_size, input_stride_0, output_stride_0, input_distance, output_distance, input_offset, @@ -1279,6 +1351,7 @@ class committed_descriptor { // TODO do everything from the next loop in a single kernel once we support more than one distance in the // kernels. std::size_t stride_between_kernels = inner_size * params.lengths[i]; + PORTFFT_LOG_TRACE("Dispatching the kernels for the dimension", i); for (std::size_t j = 0; j < params.number_of_transforms * outer_size; j++) { sycl::event e = dispatch_kernel_1d( out, out, out_imag, out_imag, previous_events, inner_size, inner_size, inner_size, 1, 1, @@ -1324,6 +1397,7 @@ class committed_descriptor { std::size_t input_stride, std::size_t output_stride, std::size_t input_distance, std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, dimension_struct& dimension_data, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); return dispatch_kernel_1d_helper( in, out, in_imag, out_imag, dependencies, n_transforms, input_stride, output_stride, input_distance, output_distance, input_offset, output_offset, dimension_data, compute_direction); @@ -1363,6 +1437,7 @@ class committed_descriptor { std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, dimension_struct& dimension_data, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); if (SubgroupSize == dimension_data.used_sg_size) { const bool input_packed = input_distance == dimension_data.length && input_stride == 1; const bool output_packed = output_distance == dimension_data.length && output_stride == 1; @@ -1375,6 +1450,8 @@ class committed_descriptor { kernel_data.level, kernel_data.length, SubgroupSize, kernel_data.factors, kernel_data.num_sgs_per_wg) * sizeof(Scalar); + PORTFFT_LOG_TRACE("Local mem required:", minimum_local_mem_required, "B. Available: ", local_memory_size, + "B."); if (static_cast(minimum_local_mem_required) > local_memory_size) { throw out_of_local_memory_error( "Insufficient amount of local memory available: " + std::to_string(local_memory_size) + @@ -1463,6 +1540,7 @@ class committed_descriptor { const std::vector& dependencies, std::size_t n_transforms, std::size_t input_offset, std::size_t output_offset, dimension_struct& dimension_data, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); // mixing const and non-const inputs leads to hard-to-debug linking errors, as both use the same kernel name, but // are called from different template instantiations. static_assert(!std::is_pointer_v || std::is_const_v>, @@ -1585,6 +1663,7 @@ struct descriptor { */ explicit descriptor(const std::vector& lengths) : lengths(lengths), forward_strides(detail::get_default_strides(lengths)), backward_strides(forward_strides) { + PORTFFT_LOG_FUNCTION_ENTRY(); // TODO: properly set default values for distances for real transforms std::size_t total_size = get_flattened_length(); forward_distance = total_size; @@ -1597,7 +1676,10 @@ struct descriptor { * @param queue queue to use for computations * @return committed_descriptor */ - committed_descriptor commit(sycl::queue& queue) { return {*this, queue}; } + committed_descriptor commit(sycl::queue& queue) { + PORTFFT_LOG_FUNCTION_ENTRY(); + return {*this, queue}; + } /** * Get the flattened length of an FFT for a single batch, ignoring strides and distance. diff --git a/src/portfft/dispatcher/global_dispatcher.hpp b/src/portfft/dispatcher/global_dispatcher.hpp index 6779d712..2c476912 100644 --- a/src/portfft/dispatcher/global_dispatcher.hpp +++ b/src/portfft/dispatcher/global_dispatcher.hpp @@ -46,6 +46,7 @@ namespace detail { */ inline std::pair get_launch_params(IdxGlobal fft_size, IdxGlobal num_batches, detail::level level, Idx n_compute_units, Idx subgroup_size, Idx n_sgs_in_wg) { + PORTFFT_LOG_FUNCTION_ENTRY(); IdxGlobal n_available_sgs = 8 * n_compute_units * 64; IdxGlobal wg_size = n_sgs_in_wg * subgroup_size; if (level == detail::level::WORKITEM) { @@ -75,6 +76,7 @@ inline std::pair get_launch_params(IdxGlobal fft_size, Idx */ template void complex_transpose(const T* a, T* b, IdxGlobal lda, IdxGlobal ldb, IdxGlobal num_elements) { + PORTFFT_LOG_FUNCTION_ENTRY(); for (IdxGlobal i = 0; i < num_elements; i++) { IdxGlobal j = i / ldb; IdxGlobal k = i % ldb; @@ -90,6 +92,7 @@ void complex_transpose(const T* a, T* b, IdxGlobal lda, IdxGlobal ldb, IdxGlobal * @return value to increment the pointer by */ inline IdxGlobal increment_twiddle_offset(detail::level level, Idx factor_size) { + PORTFFT_LOG_FUNCTION_ENTRY(); if (level == detail::level::SUBGROUP) { return 2 * factor_size; } @@ -108,6 +111,7 @@ template struct committed_descriptor::calculate_twiddles_struct::inner { static Scalar* execute(committed_descriptor& desc, dimension_struct& /*dimension_data*/, std::vector& kernels) { + PORTFFT_LOG_FUNCTION_ENTRY(); std::vector factors_idx_global; // Get factor sizes per level; for (const auto& kernel_data : kernels) { @@ -142,6 +146,8 @@ struct committed_descriptor::calculate_twiddles_struct::inner host_memory(static_cast(mem_required_for_twiddles)); std::vector scratch_space(static_cast(mem_required_for_twiddles)); + PORTFFT_LOG_TRACE("Allocating global memory for twiddles for workgroup implementation. Allocation size", + mem_required_for_twiddles); Scalar* device_twiddles = sycl::malloc_device(static_cast(mem_required_for_twiddles), desc.queue); @@ -254,14 +260,21 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t length, const std::vector& factors, detail::level level, Idx factor_num, Idx num_factors) { + PORTFFT_LOG_FUNCTION_ENTRY(); Idx length_idx = static_cast(length); + PORTFFT_LOG_TRACE("GlobalSubImplSpecConst:", level); in_bundle.template set_specialization_constant(level); + PORTFFT_LOG_TRACE("GlobalSpecConstNumFactors:", num_factors); in_bundle.template set_specialization_constant(num_factors); + PORTFFT_LOG_TRACE("GlobalSpecConstLevelNum:", factor_num); in_bundle.template set_specialization_constant(factor_num); if (level == detail::level::WORKITEM || level == detail::level::WORKGROUP) { + PORTFFT_LOG_TRACE("SpecConstFftSize:", length_idx); in_bundle.template set_specialization_constant(length_idx); } else if (level == detail::level::SUBGROUP) { + PORTFFT_LOG_TRACE("SubgroupFactorWISpecConst:", factors[1]); in_bundle.template set_specialization_constant(factors[1]); + PORTFFT_LOG_TRACE("SubgroupFactorSGSpecConst:", factors[0]); in_bundle.template set_specialization_constant(factors[0]); } } @@ -273,6 +286,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& /*desc*/, std::size_t /*length*/, Idx /*used_sg_size*/, const std::vector& /*factors*/, Idx& /*num_sgs_per_wg*/) { + PORTFFT_LOG_FUNCTION_ENTRY(); // No work required as all work done in calculate_twiddles; return 0; } @@ -287,6 +301,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, dimension_struct& dimension_data, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); complex_storage storage = desc.params.complex_storage; const IdxGlobal vec_size = storage == complex_storage::INTERLEAVED_COMPLEX ? 2 : 1; const auto& kernels = @@ -309,9 +324,12 @@ struct committed_descriptor::run_kernel_struct(kernels.at(i).length); } for (std::size_t i = 0; i < num_batches; i += max_batches_in_l2) { + PORTFFT_LOG_TRACE("Global implementation working on batches", i, "through", i + max_batches_in_l2, "out of", + num_batches); IdxGlobal intermediate_twiddles_offset = 0; IdxGlobal impl_twiddle_offset = initial_impl_twiddle_offset; auto& kernel0 = kernels.at(0); + PORTFFT_LOG_TRACE("Dispatching the kernel for factor 0 of global implementation"); l2_events = detail::compute_level( kernel0, in, desc.scratch_ptr_1.get(), in_imag, desc.scratch_ptr_1.get() + imag_offset, twiddles_ptr, @@ -326,7 +344,9 @@ struct committed_descriptor::run_kernel_struct(dimension_data.num_factors); factor_num++) { auto& current_kernel = kernels.at(factor_num); + PORTFFT_LOG_TRACE("Dispatching the kernel for factor", factor_num, "of global implementation"); if (static_cast(factor_num) == dimension_data.num_factors - 1) { + PORTFFT_LOG_TRACE("This is the last kernel"); l2_events = detail::compute_level( current_kernel, desc.scratch_ptr_1.get(), desc.scratch_ptr_1.get(), @@ -354,6 +374,7 @@ struct committed_descriptor::run_kernel_struct 0; num_transpose--) { + PORTFFT_LOG_TRACE("Dispatching the transpose kernel", num_transpose); event = detail::transpose_level( dimension_data.transpose_kernels.at(static_cast(num_transpose)), desc.scratch_ptr_1.get(), desc.scratch_ptr_2.get(), factors_and_scan, committed_size, static_cast(max_batches_in_l2), @@ -367,6 +388,7 @@ struct committed_descriptor::run_kernel_struct( dimension_data.transpose_kernels.at(0), desc.scratch_ptr_1.get(), out, factors_and_scan, committed_size, static_cast(max_batches_in_l2), n_transforms, static_cast(i), num_factors, diff --git a/src/portfft/dispatcher/subgroup_dispatcher.hpp b/src/portfft/dispatcher/subgroup_dispatcher.hpp index dff453af..166af05e 100644 --- a/src/portfft/dispatcher/subgroup_dispatcher.hpp +++ b/src/portfft/dispatcher/subgroup_dispatcher.hpp @@ -47,6 +47,7 @@ namespace detail { template IdxGlobal get_global_size_subgroup(IdxGlobal n_transforms, Idx factor_sg, Idx subgroup_size, Idx num_sgs_per_wg, Idx n_compute_units) { + PORTFFT_LOG_FUNCTION_ENTRY(); Idx maximum_n_sgs = 2 * n_compute_units * 64; Idx maximum_n_wgs = maximum_n_sgs / num_sgs_per_wg; Idx wg_size = subgroup_size * num_sgs_per_wg; @@ -589,13 +590,18 @@ template struct committed_descriptor::calculate_twiddles_struct::inner { static Scalar* execute(committed_descriptor& desc, dimension_struct& /*dimension_data*/, std::vector& kernels) { + PORTFFT_LOG_FUNCTION_ENTRY(); const auto& kernel_data = kernels.at(0); Idx factor_wi = kernel_data.factors[0]; Idx factor_sg = kernel_data.factors[1]; + PORTFFT_LOG_TRACE("Allocating global memory for twiddles for subgroup implementation. Allocation size", + kernel_data.length * 2); Scalar* res = sycl::aligned_alloc_device( alignof(sycl::vec), kernel_data.length * 2, desc.queue); sycl::range<2> kernel_range({static_cast(factor_sg), static_cast(factor_wi)}); desc.queue.submit([&](sycl::handler& cgh) { + PORTFFT_LOG_TRACE("Launching twiddle calculation kernel for subgroup implementation with global size", factor_sg, + factor_wi); cgh.parallel_for(kernel_range, [=](sycl::item<2> it) { Idx n = static_cast(it.get_id(0)); Idx k = static_cast(it.get_id(1)); @@ -617,6 +623,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, dimension_struct& dimension_data, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); constexpr detail::memory Mem = std::is_pointer_v ? detail::memory::USM : detail::memory::BUFFER; auto& kernel_data = compute_direction == direction::FORWARD ? dimension_data.forward_kernels.at(0) : dimension_data.backward_kernels.at(0); @@ -637,15 +644,23 @@ struct committed_descriptor::run_kernel_struct loc(local_elements, cgh); sycl::local_accessor loc_twiddles(twiddle_elements, cgh); -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG sycl::stream s{1024 * 16 * 16, 1024 * 8, cgh}; #endif + PORTFFT_LOG_TRACE("Launching subgroup kernel with global_size", global_size, "local_size", + SubgroupSize * kernel_data.num_sgs_per_wg, "local memory allocation of size", local_elements, + "local memory allocation for twiddles of size", twiddle_elements); cgh.parallel_for>( sycl::nd_range<1>{{global_size}, {static_cast(SubgroupSize * kernel_data.num_sgs_per_wg)}}, - [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { + [= +#ifdef PORTFFT_KERNEL_LOG + , + global_logging_config = detail::global_logging_config +#endif + ](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { detail::global_data_struct global_data{ -#ifdef PORTFFT_LOG - s, +#ifdef PORTFFT_KERNEL_LOG + s, global_logging_config, #endif it}; global_data.log_message_global("Running subgroup kernel"); @@ -665,7 +680,10 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t /*length*/, const std::vector& factors, detail::level /*level*/, Idx /*factor_num*/, Idx /*num_factors*/) { + PORTFFT_LOG_FUNCTION_ENTRY(); + PORTFFT_LOG_TRACE("SubgroupFactorWISpecConst:", factors[0]); in_bundle.template set_specialization_constant(factors[0]); + PORTFFT_LOG_TRACE("SubgroupFactorSGSpecConst:", factors[1]); in_bundle.template set_specialization_constant(factors[1]); } }; @@ -676,6 +694,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& desc, std::size_t length, Idx used_sg_size, const std::vector& factors, Idx& num_sgs_per_wg) { + PORTFFT_LOG_FUNCTION_ENTRY(); Idx dft_length = static_cast(length); Idx twiddle_bytes = 2 * dft_length * static_cast(sizeof(Scalar)); if constexpr (LayoutIn == detail::layout::BATCH_INTERLEAVED) { diff --git a/src/portfft/dispatcher/workgroup_dispatcher.hpp b/src/portfft/dispatcher/workgroup_dispatcher.hpp index 99124310..dc047550 100644 --- a/src/portfft/dispatcher/workgroup_dispatcher.hpp +++ b/src/portfft/dispatcher/workgroup_dispatcher.hpp @@ -63,6 +63,7 @@ PORTFFT_INLINE constexpr Idx get_num_batches_in_local_mem_workgroup(Idx workgrou template IdxGlobal get_global_size_workgroup(IdxGlobal n_transforms, Idx subgroup_size, Idx num_sgs_per_wg, Idx n_compute_units) { + PORTFFT_LOG_FUNCTION_ENTRY(); Idx maximum_n_sgs = 8 * n_compute_units * 64; Idx maximum_n_wgs = maximum_n_sgs / num_sgs_per_wg; Idx wg_size = subgroup_size * num_sgs_per_wg; @@ -282,6 +283,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, dimension_struct& dimension_data, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); auto& kernel_data = compute_direction == direction::FORWARD ? dimension_data.forward_kernels.at(0) : dimension_data.backward_kernels.at(0); Idx num_batches_in_local_mem = [=]() { @@ -310,15 +312,22 @@ struct committed_descriptor::run_kernel_struct loc(local_elements, cgh); -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG sycl::stream s{1024 * 16 * 8 * 2, 1024, cgh}; #endif + PORTFFT_LOG_TRACE("Launching workgroup kernel with global_size", global_size, "local_size", + SubgroupSize * kernel_data.num_sgs_per_wg, "local memory allocation of size", local_elements); cgh.parallel_for>( sycl::nd_range<1>{{global_size}, {static_cast(SubgroupSize * PORTFFT_SGS_IN_WG)}}, - [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { + [= +#ifdef PORTFFT_KERNEL_LOG + , + global_logging_config = detail::global_logging_config +#endif + ](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { detail::global_data_struct global_data{ -#ifdef PORTFFT_LOG - s, +#ifdef PORTFFT_KERNEL_LOG + s, global_logging_config, #endif it}; global_data.log_message_global("Running workgroup kernel"); @@ -338,7 +347,9 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t length, const std::vector& /*factors*/, detail::level /*level*/, Idx /*factor_num*/, Idx /*num_factors*/) { + PORTFFT_LOG_FUNCTION_ENTRY(); const Idx length_idx = static_cast(length); + PORTFFT_LOG_TRACE("SpecConstFftSize:", length_idx); in_bundle.template set_specialization_constant(length_idx); } }; @@ -349,6 +360,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& /*desc*/, std::size_t length, Idx used_sg_size, const std::vector& factors, Idx& /*num_sgs_per_wg*/) { + PORTFFT_LOG_FUNCTION_ENTRY(); std::size_t n = static_cast(factors[0]) * static_cast(factors[1]); std::size_t m = static_cast(factors[2]) * static_cast(factors[3]); // working memory + twiddles for subgroup impl for the two sizes @@ -365,6 +377,7 @@ template struct committed_descriptor::calculate_twiddles_struct::inner { static Scalar* execute(committed_descriptor& desc, dimension_struct& /*dimension_data*/, std::vector& kernels) { + PORTFFT_LOG_FUNCTION_ENTRY(); const auto& kernel_data = kernels.at(0); Idx factor_wi_n = kernel_data.factors[0]; Idx factor_sg_n = kernel_data.factors[1]; @@ -374,10 +387,14 @@ struct committed_descriptor::calculate_twiddles_struct::inner(alignof(sycl::vec), static_cast(res_size), desc.queue); desc.queue.submit([&](sycl::handler& cgh) { + PORTFFT_LOG_TRACE( + "Launching twiddle calculation kernel for factor 1 of workgroup implementation with global size", factor_sg_n, + factor_wi_n); cgh.parallel_for(sycl::range<2>({static_cast(factor_sg_n), static_cast(factor_wi_n)}), [=](sycl::item<2> it) { Idx n = static_cast(it.get_id(0)); @@ -386,6 +403,9 @@ struct committed_descriptor::calculate_twiddles_struct::inner({static_cast(factor_sg_m), static_cast(factor_wi_m)}), [=](sycl::item<2> it) { Idx n = static_cast(it.get_id(0)); @@ -394,6 +414,8 @@ struct committed_descriptor::calculate_twiddles_struct::inner({static_cast(n), static_cast(factor_wi_m), static_cast(factor_sg_m)}), [=](sycl::item<3> it) { diff --git a/src/portfft/dispatcher/workitem_dispatcher.hpp b/src/portfft/dispatcher/workitem_dispatcher.hpp index 26388572..2e26fd90 100644 --- a/src/portfft/dispatcher/workitem_dispatcher.hpp +++ b/src/portfft/dispatcher/workitem_dispatcher.hpp @@ -45,6 +45,7 @@ namespace detail { */ template IdxGlobal get_global_size_workitem(IdxGlobal n_transforms, Idx subgroup_size, Idx num_sgs_per_wg, Idx n_compute_units) { + PORTFFT_LOG_FUNCTION_ENTRY(); Idx maximum_n_sgs = 8 * n_compute_units * 64; Idx maximum_n_wgs = maximum_n_sgs / num_sgs_per_wg; Idx wg_size = subgroup_size * num_sgs_per_wg; @@ -284,6 +285,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, dimension_struct& dimension_data, direction compute_direction) { + PORTFFT_LOG_FUNCTION_ENTRY(); constexpr detail::memory Mem = std::is_pointer_v ? detail::memory::USM : detail::memory::BUFFER; auto& kernel_data = compute_direction == direction::FORWARD ? dimension_data.forward_kernels.at(0) : dimension_data.backward_kernels.at(0); @@ -300,15 +302,22 @@ struct committed_descriptor::run_kernel_struct loc(static_cast(local_elements), cgh); -#ifdef PORTFFT_LOG +#ifdef PORTFFT_KERNEL_LOG sycl::stream s{1024 * 16 * 8, 1024, cgh}; #endif + PORTFFT_LOG_TRACE("Launching workitem kernel with global_size", global_size, "local_size", + SubgroupSize * kernel_data.num_sgs_per_wg, "local memory allocation of size", local_elements); cgh.parallel_for>( sycl::nd_range<1>{{global_size}, {static_cast(SubgroupSize * kernel_data.num_sgs_per_wg)}}, - [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { + [= +#ifdef PORTFFT_KERNEL_LOG + , + global_logging_config = detail::global_logging_config +#endif + ](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { detail::global_data_struct global_data{ -#ifdef PORTFFT_LOG - s, +#ifdef PORTFFT_KERNEL_LOG + s, global_logging_config, #endif it}; global_data.log_message_global("Running workitem kernel"); @@ -328,7 +337,9 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t length, const std::vector& /*factors*/, detail::level /*level*/, Idx /*factor_num*/, Idx /*num_factors*/) { + PORTFFT_LOG_FUNCTION_ENTRY(); const Idx length_idx = static_cast(length); + PORTFFT_LOG_TRACE("SpecConstFftSize:", length_idx); in_bundle.template set_specialization_constant(length_idx); } }; @@ -339,6 +350,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& desc, std::size_t length, Idx used_sg_size, const std::vector& /*factors*/, Idx& num_sgs_per_wg) { + PORTFFT_LOG_FUNCTION_ENTRY(); Idx num_scalars_per_sg = detail::pad_local(2 * static_cast(length) * used_sg_size, 1); Idx max_n_sgs = desc.local_memory_size / static_cast(sizeof(Scalar)) / num_scalars_per_sg; num_sgs_per_wg = std::min(Idx(PORTFFT_SGS_IN_WG), std::max(Idx(1), max_n_sgs)); @@ -352,6 +364,7 @@ template struct committed_descriptor::calculate_twiddles_struct::inner { static Scalar* execute(committed_descriptor& /*desc*/, dimension_struct& /*dimension_data*/, std::vector& /*kernels*/) { + PORTFFT_LOG_FUNCTION_ENTRY(); return nullptr; } }; diff --git a/src/portfft/enums.hpp b/src/portfft/enums.hpp index cbb1c607..bbffac64 100644 --- a/src/portfft/enums.hpp +++ b/src/portfft/enums.hpp @@ -39,7 +39,7 @@ enum class direction { FORWARD, BACKWARD }; constexpr direction inv(direction dir) { return dir == direction::FORWARD ? direction::BACKWARD : direction::FORWARD; } namespace detail { -enum class pad { DO_PAD, DONT_PAD }; +enum class pad { DONT_PAD, DO_PAD }; enum class level { WORKITEM, SUBGROUP, WORKGROUP, GLOBAL }; @@ -66,11 +66,11 @@ enum class transfer_direction { GLOBAL_TO_LOCAL }; -enum class elementwise_multiply { APPLIED, NOT_APPLIED }; +enum class elementwise_multiply { NOT_APPLIED, APPLIED }; -enum class apply_scale_factor { APPLIED, NOT_APPLIED }; +enum class apply_scale_factor { NOT_APPLIED, APPLIED }; -enum class complex_conjugate { APPLIED, NOT_APPLIED }; +enum class complex_conjugate { NOT_APPLIED, APPLIED }; } // namespace detail } // namespace portfft diff --git a/src/portfft/utils.hpp b/src/portfft/utils.hpp index 17abf7d0..3a79fbf3 100644 --- a/src/portfft/utils.hpp +++ b/src/portfft/utils.hpp @@ -26,6 +26,7 @@ #include #include +#include "common/logging.hpp" #include "defines.hpp" #include "enums.hpp" @@ -44,6 +45,7 @@ class transpose_kernel; template