From e7d7daf2dcf2776311d4e98cd7c4d554f537c85f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Mon, 29 Jan 2024 08:54:40 +0100 Subject: [PATCH 1/9] logging --- src/portfft/common/global.hpp | 24 ++-- src/portfft/common/logging.hpp | 91 ++++++++++++++- src/portfft/descriptor.hpp | 105 +++++++++++++++--- src/portfft/dispatcher/global_dispatcher.hpp | 20 ++++ .../dispatcher/subgroup_dispatcher.hpp | 10 ++ .../dispatcher/workgroup_dispatcher.hpp | 13 ++- .../dispatcher/workitem_dispatcher.hpp | 11 +- src/portfft/enums.hpp | 6 +- src/portfft/utils.hpp | 5 + 9 files changed, 254 insertions(+), 31 deletions(-) diff --git a/src/portfft/common/global.hpp b/src/portfft/common/global.hpp index 08b14258..5c7b498c 100644 --- a/src/portfft/common/global.hpp +++ b/src/portfft/common/global.hpp @@ -209,10 +209,12 @@ void launch_kernel(sycl::accessor& in const IdxGlobal* inclusive_scan, IdxGlobal n_transforms, Scalar scale_factor, IdxGlobal input_batch_offset, std::pair, sycl::range<1>> launch_params, sycl::handler& cgh) { + LOG_FUNCTION_ENTRY(); auto [global_range, local_range] = launch_params; #ifdef PORTFFT_LOG sycl::stream s{1024 * 16, 1024, cgh}; #endif + LOG_TRACE("Launching kernel for global implementation with global_size", global_range[0], "local_size", local_range[0]); cgh.parallel_for>( sycl::nd_range<1>(global_range, local_range), [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { @@ -264,10 +266,12 @@ void launch_kernel(const Scalar* input, Scalar* output, const Scalar* input_imag const IdxGlobal* inclusive_scan, IdxGlobal n_transforms, Scalar scale_factor, IdxGlobal input_batch_offset, std::pair, sycl::range<1>> launch_params, sycl::handler& cgh) { + LOG_FUNCTION_ENTRY(); #ifdef PORTFFT_LOG sycl::stream s{1024 * 16 * 16, 1024, cgh}; #endif auto [global_range, local_range] = launch_params; + LOG_TRACE("Launching kernel for global implementation with global_size", global_range[0], "local_size", local_range[0]); cgh.parallel_for>( sycl::nd_range<1>(global_range, local_range), [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { @@ -304,13 +308,15 @@ static void dispatch_transpose_kernel_impl(const Scalar* input, sycl::local_accessor& loc, const IdxGlobal* factors, const IdxGlobal* inner_batches, const IdxGlobal* inclusive_scan, IdxGlobal output_offset, IdxGlobal lda, IdxGlobal ldb, sycl::handler& cgh) { + LOG_FUNCTION_ENTRY(); #ifdef PORTFFT_LOG sycl::stream s{1024 * 16, 1024, cgh}; #endif + std::size_t lda_rounded = detail::round_up_to_multiple(static_cast(lda), static_cast(16)); + std::size_t ldb_rounded = detail::round_up_to_multiple(static_cast(ldb), static_cast(16)); + LOG_TRACE("Launching transpose kernel with global_size", lda_rounded, ldb_rounded, "local_size", 16, 16); cgh.parallel_for>( - sycl::nd_range<2>({detail::round_up_to_multiple(static_cast(lda), static_cast(16)), - detail::round_up_to_multiple(static_cast(ldb), static_cast(16))}, - {16, 16}), + sycl::nd_range<2>({lda_rounded, ldb_rounded}, {16, 16}), [=](sycl::nd_item<2> it, sycl::kernel_handler kh) { detail::global_data_struct global_data{ #ifdef PORTFFT_LOG @@ -357,13 +363,15 @@ static void dispatch_transpose_kernel_impl(const Scalar* input, Scalar* output, const IdxGlobal* factors, const IdxGlobal* inner_batches, const IdxGlobal* inclusive_scan, IdxGlobal output_offset, IdxGlobal lda, IdxGlobal ldb, sycl::handler& cgh) { + LOG_FUNCTION_ENTRY(); #ifdef PORTFFT_LOG sycl::stream s{1024 * 16 * 16, 1024, cgh}; #endif + std::size_t lda_rounded = detail::round_up_to_multiple(static_cast(lda), static_cast(16)); + std::size_t ldb_rounded = detail::round_up_to_multiple(static_cast(ldb), static_cast(16)); + LOG_TRACE("Launching transpose kernel with global_size", lda_rounded, ldb_rounded, "local_size", 16, 16); cgh.parallel_for>( - sycl::nd_range<2>({detail::round_up_to_multiple(static_cast(lda), static_cast(16)), - detail::round_up_to_multiple(static_cast(ldb), static_cast(16))}, - {16, 16}), + sycl::nd_range<2>({lda_rounded, ldb_rounded}, {16, 16}), [=](sycl::nd_item<2> it, sycl::kernel_handler kh) { detail::global_data_struct global_data{ #ifdef PORTFFT_LOG @@ -416,7 +424,8 @@ sycl::event transpose_level(const typename committed_descriptor: const Scalar* input, TOut output, const IdxGlobal* factors_triple, IdxGlobal committed_size, Idx num_batches_in_l2, IdxGlobal n_transforms, IdxGlobal batch_start, Idx total_factors, IdxGlobal output_offset, sycl::queue& queue, const std::vector& events, - complex_storage storage) { + complex_storage storage) { + LOG_FUNCTION_ENTRY(); const IdxGlobal vec_size = storage == complex_storage::INTERLEAVED_COMPLEX ? 2 : 1; std::vector transpose_events; IdxGlobal ld_input = kd_struct.factors.at(1); @@ -490,6 +499,7 @@ std::vector compute_level( IdxGlobal input_global_offset, IdxGlobal committed_size, Idx num_batches_in_l2, IdxGlobal n_transforms, IdxGlobal batch_start, Idx factor_id, Idx total_factors, complex_storage storage, const std::vector& dependencies, sycl::queue& queue) { + LOG_FUNCTION_ENTRY(); IdxGlobal local_range = kd_struct.local_range; IdxGlobal global_range = kd_struct.global_range; IdxGlobal batch_size = kd_struct.batch_size; diff --git a/src/portfft/common/logging.hpp b/src/portfft/common/logging.hpp index efef47ce..a677284a 100644 --- a/src/portfft/common/logging.hpp +++ b/src/portfft/common/logging.hpp @@ -93,8 +93,7 @@ struct global_data_struct { } /** - * Implementation of log_message. End of recursion - logs the messages separated by newlines, adds a newline and - * flushes the stream. + * Implementation of log_message. * * @tparam TFirst type of the first object to log * @tparam Ts types of the other objects to log @@ -261,6 +260,71 @@ struct global_data_struct { } }; +/* + * Outputs an object to std::cout. Most objects are piped directly to std::cout. + * + * @tparam T type of the object to output + * @param object object to output + */ +template >* = nullptr> +__attribute__((always_inline)) inline void output(T object) { + std::cout << object; +} + +/* + * Outputs an object to std::cout. Enums are first cast to underlying type. + * + * @tparam T type of the object to output + * @param object object to output + */ +template >* = nullptr> +__attribute__((always_inline)) inline void output(T object) { + output(static_cast>(object)); +} + +/* + * Outputs an object to std::cout. A `std::vector` is output by elements. + * + * @tparam T type of the object to output + * @param object object to output + */ +template +__attribute__((always_inline)) inline void output(const std::vector& object) { + std::cout << "("; + for(const T& element : object){ + output(element); + std::cout << ", "; + } + std::cout << ")"; +} + +/* + * Logs a message. End of recursion - logs the message, adds a newline and flushes the stream. + * + * @tparam T type of the object to log + * @param message message to log + */ +template +__attribute__((always_inline)) inline void log_message_impl(T message) { + output(message); + std::cout << std::endl; +} + +/** + * Logs a message. + * + * @tparam TFirst type of the first object to log + * @tparam Ts types of the other objects to log + * @param message the first message to log + * @param other_messages other messages to log + */ +template +__attribute__((always_inline)) inline void log_message_impl(TFirst message, Ts... other_messages) { + output(message); + std::cout << " "; + log_message_impl(other_messages...); +} + /** * Prints the message and dumps data from host to standard output * @@ -302,6 +366,29 @@ PORTFFT_INLINE void dump_device([[maybe_unused]] sycl::queue& q, [[maybe_unused] #endif } + /** + * Logs a message. Can log multiple objects/strings. They will be separated by spaces. + * + * Does nothing if logging of traces is not enabled (PORTFFT_LOG_TRACE is not defined). + * + * @tparam Ts types of the objects to log + * @param messages objects to log + */ + template + PORTFFT_INLINE void log_message([[maybe_unused]] Ts... messages) { +#ifdef PORTFFT_LOG_TRACE + log_message_impl(messages...); +#endif + } + +#define LOGGING_LOCATION_INFORMATION __FILE__ ", line", __LINE__, "- in", __FUNCTION__, ":" + +#define LOG_FUNCTION_ENTRY() \ + detail::log_message(LOGGING_LOCATION_INFORMATION, "entered") + +#define LOG_TRACE(...) \ + detail::log_message(LOGGING_LOCATION_INFORMATION, __VA_ARGS__) + }; // namespace portfft::detail #endif diff --git a/src/portfft/descriptor.hpp b/src/portfft/descriptor.hpp index f83aef6a..d8b77585 100644 --- a/src/portfft/descriptor.hpp +++ b/src/portfft/descriptor.hpp @@ -76,6 +76,7 @@ class transpose_kernel; * @param lengths the dimensions of the dft */ inline std::vector get_default_strides(const std::vector& lengths) { + LOG_FUNCTION_ENTRY(); std::vector strides(lengths.size()); std::size_t total_size = 1; for (std::size_t i_plus1 = lengths.size(); i_plus1 > 0; i_plus1--) { @@ -83,6 +84,7 @@ inline std::vector get_default_strides(const std::vector std::tuple, std::vector>>> prepare_implementation(std::size_t kernel_num) { + LOG_FUNCTION_ENTRY(); // TODO: check and support all the parameter values if constexpr (Domain != domain::COMPLEX) { throw unsupported_configuration("portFFT only supports complex to complex transforms"); @@ -322,6 +325,7 @@ class committed_descriptor { IdxGlobal fft_size = static_cast(params.lengths[kernel_num]); if (detail::fits_in_wi(fft_size)) { ids = detail::get_ids(); + LOG_TRACE("Prepared workitem impl for size: ", fft_size); return {detail::level::WORKITEM, {{detail::level::WORKITEM, ids, factors}}}; } if (detail::fits_in_sg(fft_size, SubgroupSize)) { @@ -332,6 +336,7 @@ class committed_descriptor { factors.push_back(factor_wi); factors.push_back(factor_sg); ids = detail::get_ids(); + LOG_TRACE("Prepared subgroup impl with factor_wi:", factor_wi, "and factor_sg:", factor_sg); return {detail::level::SUBGROUP, {{detail::level::SUBGROUP, ids, factors}}}; } IdxGlobal n_idx_global = detail::factorize(fft_size); @@ -362,9 +367,11 @@ class committed_descriptor { // This factorization of N and M is duplicated in the dispatch logic on the device. // The CT and spec constant factors should match. ids = detail::get_ids(); + LOG_TRACE("Prepared workgroup impl with factor_wi_n:", factor_wi_n, " factor_sg_n:", factor_sg_n, " factor_wi_m:", factor_wi_m, " factor_sg_m:", factor_sg_m); return {detail::level::WORKGROUP, {{detail::level::WORKGROUP, ids, factors}}}; } } + LOG_TRACE("Preparing global impl"); std::vector, std::vector>> param_vec; auto check_and_select_target_level = [&](IdxGlobal factor_size, bool batch_interleaved_layout = true) -> bool { if (detail::fits_in_wi(factor_size)) { @@ -372,7 +379,7 @@ class committed_descriptor { param_vec.emplace_back(detail::level::WORKITEM, detail::get_ids(), std::vector{static_cast(factor_size)}); - + LOG_TRACE("Workitem kernel for factor:", factor_size); return true; } bool fits_in_local_memory_subgroup = [&]() { @@ -400,11 +407,12 @@ class committed_descriptor { }(); if (detail::fits_in_sg(factor_size, SubgroupSize) && fits_in_local_memory_subgroup && !PORTFFT_SLOW_SG_SHUFFLES) { + Idx factor_sg = detail::factorize_sg(static_cast(factor_size), SubgroupSize); + Idx factor_wi = static_cast(factor_size) / factor_sg; + LOG_TRACE("Subgroup kernel for factor:", factor_size, "with factor_wi:", factor_wi, "and factor_sg:", factor_sg); param_vec.emplace_back(detail::level::SUBGROUP, detail::get_ids(), - std::vector{detail::factorize_sg(static_cast(factor_size), SubgroupSize), - static_cast(factor_size) / - detail::factorize_sg(static_cast(factor_size), SubgroupSize)}); + std::vector{factor_sg, factor_wi}); return true; } return false; @@ -444,13 +452,21 @@ class committed_descriptor { detail::elementwise_multiply multiply_on_load, detail::elementwise_multiply multiply_on_store, detail::apply_scale_factor scale_factor_applied, detail::level level, Idx factor_num = 0, Idx num_factors = 0) { + LOG_FUNCTION_ENTRY(); const Idx length_idx = static_cast(length); // These spec constants are used in all implementations, so we set them here + LOG_TRACE("Setting specialization constants:"); + LOG_TRACE("SpecConstComplexStorage:", params.complex_storage); in_bundle.template set_specialization_constant(params.complex_storage); + LOG_TRACE("SpecConstNumRealsPerFFT:", 2 * length_idx); in_bundle.template set_specialization_constant(2 * length_idx); + LOG_TRACE("SpecConstWIScratchSize:", 2 * detail::wi_temps(length_idx)); in_bundle.template set_specialization_constant(2 * detail::wi_temps(length_idx)); + LOG_TRACE("SpecConstMultiplyOnLoad:", multiply_on_load); in_bundle.template set_specialization_constant(multiply_on_load); + LOG_TRACE("SpecConstMultiplyOnStore:", multiply_on_store); in_bundle.template set_specialization_constant(multiply_on_store); + LOG_TRACE("SpecConstApplyScaleFactor:", scale_factor_applied); in_bundle.template set_specialization_constant(scale_factor_applied); dispatch(top_level, in_bundle, length, factors, level, factor_num, num_factors); } @@ -481,6 +497,7 @@ class committed_descriptor { template std::size_t num_scalars_in_local_mem(detail::level level, std::size_t length, Idx used_sg_size, const std::vector& factors, Idx& num_sgs_per_wg) { + LOG_FUNCTION_ENTRY(); return dispatch(level, length, used_sg_size, factors, num_sgs_per_wg); } @@ -502,6 +519,7 @@ class committed_descriptor { * @return Scalar* USM pointer to the twiddle factors */ Scalar* calculate_twiddles(dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); return dispatch(dimension_data.level, dimension_data); } @@ -516,6 +534,7 @@ class committed_descriptor { */ template dimension_struct build_w_spec_const(std::size_t kernel_num) { + LOG_FUNCTION_ENTRY(); if (std::count(supported_sg_sizes.begin(), supported_sg_sizes.end(), SubgroupSize)) { auto [top_level, prepared_vec] = prepare_implementation(kernel_num); bool is_compatible = true; @@ -579,6 +598,7 @@ class committed_descriptor { * @param num_global_level_dimensions number of global level dimensions in the committed size */ void allocate_scratch_and_precompute_scan(Idx num_global_level_dimensions) { + LOG_FUNCTION_ENTRY(); std::size_t n_kernels = params.lengths.size(); if (num_global_level_dimensions == 1) { std::size_t global_dimension = 0; @@ -602,7 +622,7 @@ class committed_descriptor { } dimensions.at(global_dimension).num_factors = static_cast(factors.size()); std::size_t cache_space_left_for_batches = static_cast(llc_size) - cache_required_for_twiddles; - // TODO: In case of mutli-dim (single dim global sized), this should be batches corresposding to that dim + // TODO: In case of mutli-dim (single dim global sized), this should be batches corresponding to that dim dimensions.at(global_dimension).num_batches_in_l2 = static_cast(std::min( static_cast(PORTFFT_MAX_CONCURRENT_KERNELS), std::min(params.number_of_transforms, @@ -610,14 +630,9 @@ class committed_descriptor { (2 * dimensions.at(global_dimension).length * sizeof(Scalar)))))); scratch_space_required = 2 * dimensions.at(global_dimension).length * static_cast(dimensions.at(global_dimension).num_batches_in_l2); - scratch_ptr_1 = - detail::make_shared(2 * dimensions.at(global_dimension).length * - static_cast(dimensions.at(global_dimension).num_batches_in_l2), - queue); - scratch_ptr_2 = - detail::make_shared(2 * dimensions.at(global_dimension).length * - static_cast(dimensions.at(global_dimension).num_batches_in_l2), - queue); + LOG_TRACE("Allocating 2 scratch arrays of size", scratch_space_required, "scalars in global memory"); + scratch_ptr_1 = detail::make_shared(scratch_space_required, queue); + scratch_ptr_2 = detail::make_shared(scratch_space_required, queue); inclusive_scan.push_back(factors.at(0)); for (std::size_t i = 1; i < factors.size(); i++) { inclusive_scan.push_back(inclusive_scan.at(i - 1) * factors.at(i)); @@ -637,8 +652,12 @@ class committed_descriptor { std::vector ids; auto in_bundle = sycl::get_kernel_bundle(queue.get_context(), detail::get_transpose_kernel_ids()); + LOG_TRACE("Setting specialization constants for transpose kernel", i); + LOG_TRACE("SpecConstComplexStorage:", params.complex_storage); in_bundle.template set_specialization_constant(params.complex_storage); + LOG_TRACE("GlobalSpecConstLevelNum:", i); in_bundle.template set_specialization_constant(static_cast(i)); + LOG_TRACE("GlobalSpecConstNumFactors:", factors.size()); in_bundle.template set_specialization_constant( static_cast(factors.size())); dimensions.at(global_dimension) @@ -690,7 +709,10 @@ class committed_descriptor { for (std::size_t j = 0; j < num_transposes_required; j++) { auto in_bundle = sycl::get_kernel_bundle( queue.get_context(), detail::get_transpose_kernel_ids()); + LOG_TRACE("Setting specilization constants for transpose kernel", j); + LOG_TRACE("GlobalSpecConstLevelNum:", i); in_bundle.template set_specialization_constant(static_cast(i)); + LOG_TRACE("GlobalSpecConstNumFactors:", factors.size()); in_bundle.template set_specialization_constant( static_cast(factors.size())); dimensions.at(i).kernels.emplace_back( @@ -719,8 +741,14 @@ class committed_descriptor { supported_sg_sizes(dev.get_info()), local_memory_size(static_cast(queue.get_device().get_info())), llc_size(static_cast(queue.get_device().get_info())) { - // check it's suitable to run + LOG_FUNCTION_ENTRY(); + LOG_TRACE("Device info:"); + LOG_TRACE("n_compute_units:", n_compute_units); + LOG_TRACE("supported_sg_sizes:", supported_sg_sizes); + LOG_TRACE("local_memory_size:", local_memory_size); + LOG_TRACE("llc_size:", llc_size); + // check it's suitable to run const auto forward_layout = detail::get_layout(params, direction::FORWARD); const auto backward_layout = detail::get_layout(params, direction::BACKWARD); if (params.lengths.size() > 1) { @@ -745,6 +773,7 @@ class committed_descriptor { dimensions.back().kernels.at(0).twiddles_forward = std::shared_ptr(calculate_twiddles(dimensions.back()), [queue](Scalar* ptr) { if (ptr != nullptr) { + LOG_TRACE("Freeing the array for twiddle factors"); sycl::free(ptr, queue); } }); @@ -778,6 +807,7 @@ class committed_descriptor { * @param desc committed_descriptor of which the copy is to be made */ void create_copy(const committed_descriptor& desc) { + LOG_FUNCTION_ENTRY(); #define PORTFFT_COPY(x) this->x = desc.x; PORTFFT_COPY(params) PORTFFT_COPY(queue) @@ -799,6 +829,7 @@ class committed_descriptor { } } if (is_scratch_required) { + LOG_TRACE("Allocating 2 scratch arrays of size", desc.scratch_space_required, "Scalars in global memory"); this->scratch_ptr_1 = detail::make_shared(static_cast(desc.scratch_space_required), this->queue); this->scratch_ptr_2 = @@ -807,13 +838,19 @@ class committed_descriptor { } public: - committed_descriptor(const committed_descriptor& desc) : params(desc.params) { create_copy(desc); } + committed_descriptor(const committed_descriptor& desc) : params(desc.params) { + LOG_FUNCTION_ENTRY(); + create_copy(desc); + } + committed_descriptor& operator=(const committed_descriptor& desc) { + LOG_FUNCTION_ENTRY(); if (this != &desc) { create_copy(desc); } return *this; } + static_assert(std::is_same_v || std::is_same_v, "Scalar must be either float or double!"); /** @@ -828,7 +865,10 @@ class committed_descriptor { /** * Destructor */ - ~committed_descriptor() { queue.wait(); } + ~committed_descriptor() { + LOG_FUNCTION_ENTRY(); + queue.wait(); + } // default construction is not appropriate committed_descriptor() = delete; @@ -839,6 +879,7 @@ class committed_descriptor { * @param inout buffer containing input and output data */ void compute_forward(sycl::buffer& inout) { + LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_forward(inout, inout); @@ -851,6 +892,7 @@ class committed_descriptor { * @param inout_imag buffer containing imaginary part of the input and output data */ void compute_forward(sycl::buffer& inout_real, sycl::buffer& inout_imag) { + LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_forward(inout_real, inout_imag, inout_real, inout_imag); @@ -862,6 +904,7 @@ class committed_descriptor { * @param inout buffer containing input and output data */ void compute_backward(sycl::buffer& inout) { + LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_backward(inout, inout); @@ -874,6 +917,7 @@ class committed_descriptor { * @param inout_imag buffer containing imaginary part of the input and output data */ void compute_backward(sycl::buffer& inout_real, sycl::buffer& inout_imag) { + LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. compute_backward(inout_real, inout_imag, inout_real, inout_imag); @@ -886,6 +930,7 @@ class committed_descriptor { * @param out buffer containing output data */ void compute_forward(const sycl::buffer& in, sycl::buffer& out) { + LOG_FUNCTION_ENTRY(); dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX); } @@ -899,6 +944,7 @@ class committed_descriptor { */ void compute_forward(const sycl::buffer& in_real, const sycl::buffer& in_imag, sycl::buffer& out_real, sycl::buffer& out_imag) { + LOG_FUNCTION_ENTRY(); dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX); } @@ -909,6 +955,7 @@ class committed_descriptor { * @param out buffer containing output data */ void compute_forward(const sycl::buffer& /*in*/, sycl::buffer& /*out*/) { + LOG_FUNCTION_ENTRY(); throw unsupported_configuration("Real to complex FFTs not yet implemented."); } @@ -919,6 +966,7 @@ class committed_descriptor { * @param out buffer containing output data */ void compute_backward(const sycl::buffer& in, sycl::buffer& out) { + LOG_FUNCTION_ENTRY(); dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX); } @@ -932,6 +980,7 @@ class committed_descriptor { */ void compute_backward(const sycl::buffer& in_real, const sycl::buffer& in_imag, sycl::buffer& out_real, sycl::buffer& out_imag) { + LOG_FUNCTION_ENTRY(); dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX); } @@ -943,6 +992,7 @@ class committed_descriptor { * @return sycl::event associated with this computation */ sycl::event compute_forward(complex_type* inout, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. return compute_forward(inout, inout, dependencies); @@ -958,6 +1008,7 @@ class committed_descriptor { */ sycl::event compute_forward(scalar_type* inout_real, scalar_type* inout_imag, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. return compute_forward(inout_real, inout_imag, inout_real, inout_imag, dependencies); @@ -971,6 +1022,7 @@ class committed_descriptor { * @return sycl::event associated with this computation */ sycl::event compute_forward(Scalar* inout, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); // For now we can just call out-of-place implementation. // This might need to be changed once we implement support for large sizes that work in global memory. return compute_forward(inout, reinterpret_cast(inout), dependencies); @@ -984,6 +1036,7 @@ class committed_descriptor { * @return sycl::event associated with this computation */ sycl::event compute_backward(complex_type* inout, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); return compute_backward(inout, inout, dependencies); } @@ -997,6 +1050,7 @@ class committed_descriptor { */ sycl::event compute_backward(scalar_type* inout_real, scalar_type* inout_imag, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); return compute_backward(inout_real, inout_imag, inout_real, inout_imag, dependencies); } @@ -1010,6 +1064,7 @@ class committed_descriptor { */ sycl::event compute_forward(const complex_type* in, complex_type* out, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); return dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX, dependencies); } @@ -1025,6 +1080,7 @@ class committed_descriptor { */ sycl::event compute_forward(const scalar_type* in_real, const scalar_type* in_imag, scalar_type* out_real, scalar_type* out_imag, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); return dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX, dependencies); } @@ -1039,6 +1095,7 @@ class committed_descriptor { */ sycl::event compute_forward(const Scalar* /*in*/, complex_type* /*out*/, const std::vector& /*dependencies*/ = {}) { + LOG_FUNCTION_ENTRY(); throw unsupported_configuration("Real to complex FFTs not yet implemented."); return {}; } @@ -1053,6 +1110,7 @@ class committed_descriptor { */ sycl::event compute_backward(const complex_type* in, complex_type* out, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); return dispatch_direction(in, out, in, out, complex_storage::INTERLEAVED_COMPLEX, dependencies); } @@ -1069,6 +1127,7 @@ class committed_descriptor { */ sycl::event compute_backward(const scalar_type* in_real, const scalar_type* in_imag, scalar_type* out_real, scalar_type* out_imag, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); return dispatch_direction(in_real, out_real, in_imag, out_imag, complex_storage::SPLIT_COMPLEX, dependencies); } @@ -1095,6 +1154,7 @@ class committed_descriptor { template sycl::event dispatch_direction(const TIn& in, TOut& out, const TIn& in_imag, TOut& out_imag, complex_storage used_storage, const std::vector& dependencies = {}) { + LOG_FUNCTION_ENTRY(); #ifndef PORTFFT_ENABLE_BUFFER_BUILDS if constexpr (!std::is_pointer_v || !std::is_pointer_v) { throw invalid_configuration("Buffer interface can not be called when buffer builds are disabled."); @@ -1153,6 +1213,7 @@ class committed_descriptor { const std::vector& output_strides, std::size_t input_distance, std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, Scalar scale_factor) { + LOG_FUNCTION_ENTRY(); using TOutConst = std::conditional_t, const std::remove_pointer_t*, const TOut>; std::size_t n_dimensions = params.lengths.size(); std::size_t total_size = params.get_flattened_length(); @@ -1182,6 +1243,7 @@ class committed_descriptor { output_distance = params.lengths.back(); } + LOG_TRACE("Dispatching the kernel for the last dimension"); sycl::event previous_event = dispatch_kernel_1d( in, out, in_imag, out_imag, dependencies, params.number_of_transforms * outer_size, input_stride_0, output_stride_0, input_distance, output_distance, input_offset, output_offset, scale_factor, dimensions.back()); @@ -1196,6 +1258,7 @@ class committed_descriptor { // TODO do everything from the next loop in a single kernel once we support more than one distance in the // kernels. std::size_t stride_between_kernels = inner_size * params.lengths[i]; + LOG_TRACE("Dispatching the kernels for the dimension", i); for (std::size_t j = 0; j < params.number_of_transforms * outer_size; j++) { sycl::event e = dispatch_kernel_1d( out, out, out_imag, out_imag, previous_events, inner_size, inner_size, inner_size, 1, 1, @@ -1242,6 +1305,7 @@ class committed_descriptor { std::size_t input_stride, std::size_t output_stride, std::size_t input_distance, std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, Scalar scale_factor, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); return dispatch_kernel_1d_helper( in, out, in_imag, out_imag, dependencies, n_transforms, input_stride, output_stride, input_distance, output_distance, input_offset, output_offset, scale_factor, dimension_data); @@ -1282,6 +1346,7 @@ class committed_descriptor { std::size_t output_distance, std::size_t input_offset, std::size_t output_offset, Scalar scale_factor, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); if (SubgroupSize == dimension_data.used_sg_size) { const bool input_packed = input_distance == dimension_data.length && input_stride == 1; const bool output_packed = output_distance == dimension_data.length && output_stride == 1; @@ -1294,6 +1359,7 @@ class committed_descriptor { kernel_data.level, kernel_data.length, SubgroupSize, kernel_data.factors, kernel_data.num_sgs_per_wg) * sizeof(Scalar); + LOG_TRACE("Local mem required:", minimum_local_mem_required ,"B. Available: ", local_memory_size, "B."); if (static_cast(minimum_local_mem_required) > local_memory_size) { throw out_of_local_memory_error( "Insufficient amount of local memory available: " + std::to_string(local_memory_size) + @@ -1386,6 +1452,7 @@ class committed_descriptor { const std::vector& dependencies, std::size_t n_transforms, std::size_t input_offset, std::size_t output_offset, Scalar scale_factor, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); // mixing const and non-const inputs leads to hard-to-debug linking errors, as both use the same kernel name, but // are called from different template instantiations. static_assert(!std::is_pointer_v || std::is_const_v>, @@ -1508,6 +1575,7 @@ struct descriptor { */ explicit descriptor(const std::vector& lengths) : lengths(lengths), forward_strides(detail::get_default_strides(lengths)), backward_strides(forward_strides) { + LOG_FUNCTION_ENTRY(); // TODO: properly set default values for distances for real transforms std::size_t total_size = get_flattened_length(); forward_distance = total_size; @@ -1520,7 +1588,10 @@ struct descriptor { * @param queue queue to use for computations * @return committed_descriptor */ - committed_descriptor commit(sycl::queue& queue) { return {*this, queue}; } + committed_descriptor commit(sycl::queue& queue) { + LOG_FUNCTION_ENTRY(); + return {*this, queue}; + } /** * Get the flattened length of an FFT for a single batch, ignoring strides and distance. diff --git a/src/portfft/dispatcher/global_dispatcher.hpp b/src/portfft/dispatcher/global_dispatcher.hpp index cae8d8fd..9ebf73fc 100644 --- a/src/portfft/dispatcher/global_dispatcher.hpp +++ b/src/portfft/dispatcher/global_dispatcher.hpp @@ -46,6 +46,7 @@ namespace detail { */ inline std::pair get_launch_params(IdxGlobal fft_size, IdxGlobal num_batches, detail::level level, Idx n_compute_units, Idx subgroup_size, Idx n_sgs_in_wg) { + LOG_FUNCTION_ENTRY(); IdxGlobal n_available_sgs = 8 * n_compute_units * 64; IdxGlobal wg_size = n_sgs_in_wg * subgroup_size; if (level == detail::level::WORKITEM) { @@ -75,6 +76,7 @@ inline std::pair get_launch_params(IdxGlobal fft_size, Idx */ template void complex_transpose(const T* a, T* b, IdxGlobal lda, IdxGlobal ldb, IdxGlobal num_elements) { + LOG_FUNCTION_ENTRY(); for (IdxGlobal i = 0; i < num_elements; i++) { IdxGlobal j = i / ldb; IdxGlobal k = i % ldb; @@ -90,6 +92,7 @@ void complex_transpose(const T* a, T* b, IdxGlobal lda, IdxGlobal ldb, IdxGlobal * @return value to increment the pointer by */ inline IdxGlobal increment_twiddle_offset(detail::level level, Idx factor_size) { + LOG_FUNCTION_ENTRY(); if (level == detail::level::SUBGROUP) { return 2 * factor_size; } @@ -107,6 +110,7 @@ template template struct committed_descriptor::calculate_twiddles_struct::inner { static Scalar* execute(committed_descriptor& desc, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); auto& kernels = dimension_data.kernels; std::vector factors_idx_global; // Get factor sizes per level; @@ -142,6 +146,7 @@ struct committed_descriptor::calculate_twiddles_struct::inner host_memory(static_cast(mem_required_for_twiddles)); std::vector scratch_space(static_cast(mem_required_for_twiddles)); + LOG_TRACE("Allocating global memory for twiddles for workgroup implementation. Allocation size", mem_required_for_twiddles); Scalar* device_twiddles = sycl::malloc_device(static_cast(mem_required_for_twiddles), desc.queue); @@ -254,14 +259,21 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t length, const std::vector& factors, detail::level level, Idx factor_num, Idx num_factors) { + LOG_FUNCTION_ENTRY(); Idx length_idx = static_cast(length); + LOG_TRACE("GlobalSubImplSpecConst:", level); in_bundle.template set_specialization_constant(level); + LOG_TRACE("GlobalSpecConstNumFactors:", num_factors); in_bundle.template set_specialization_constant(num_factors); + LOG_TRACE("GlobalSpecConstLevelNum:", factor_num); in_bundle.template set_specialization_constant(factor_num); if (level == detail::level::WORKITEM || level == detail::level::WORKGROUP) { + LOG_TRACE("SpecConstFftSize:", length_idx); in_bundle.template set_specialization_constant(length_idx); } else if (level == detail::level::SUBGROUP) { + LOG_TRACE("SubgroupFactorWISpecConst:", factors[1]); in_bundle.template set_specialization_constant(factors[1]); + LOG_TRACE("SubgroupFactorSGSpecConst:", factors[0]); in_bundle.template set_specialization_constant(factors[0]); } } @@ -273,6 +285,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& /*desc*/, std::size_t /*length*/, Idx /*used_sg_size*/, const std::vector& /*factors*/, Idx& /*num_sgs_per_wg*/) { + LOG_FUNCTION_ENTRY(); // No work required as all work done in calculate_twiddles; return 0; } @@ -288,6 +301,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, Scalar scale_factor, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); complex_storage storage = desc.params.complex_storage; const IdxGlobal vec_size = storage == complex_storage::INTERLEAVED_COMPLEX ? 2 : 1; const auto& kernels = dimension_data.kernels; @@ -309,9 +323,11 @@ struct committed_descriptor::run_kernel_struct(kernels.at(i).length); } for (std::size_t i = 0; i < num_batches; i += max_batches_in_l2) { + LOG_TRACE("Global implementation working on batches", i, "through", i + max_batches_in_l2, "out of", num_batches); IdxGlobal intermediate_twiddles_offset = 0; IdxGlobal impl_twiddle_offset = initial_impl_twiddle_offset; auto& kernel0 = kernels.at(0); + LOG_TRACE("Dispatching the kernel for factor 0 of global implementation"); l2_events = detail::compute_level( kernel0, in, desc.scratch_ptr_1.get(), in_imag, desc.scratch_ptr_1.get() + imag_offset, twiddles_ptr, @@ -326,7 +342,9 @@ struct committed_descriptor::run_kernel_struct(dimension_data.num_factors); factor_num++) { auto& current_kernel = kernels.at(factor_num); + LOG_TRACE("Dispatching the kernel for factor", factor_num, "of global implementation"); if (static_cast(factor_num) == dimension_data.num_factors - 1) { + LOG_TRACE("This is the last kernel"); l2_events = detail::compute_level( current_kernel, desc.scratch_ptr_1.get(), desc.scratch_ptr_1.get(), @@ -354,6 +372,7 @@ struct committed_descriptor::run_kernel_struct 0; num_transpose--) { + LOG_TRACE("Dispatching the transpose kernel", num_transpose); event = detail::transpose_level( kernels.at(static_cast(num_transpose) + static_cast(num_factors)), desc.scratch_ptr_1.get(), desc.scratch_ptr_2.get(), factors_and_scan, committed_size, @@ -368,6 +387,7 @@ struct committed_descriptor::run_kernel_struct( kernels.at(static_cast(num_factors)), desc.scratch_ptr_1.get(), out, factors_and_scan, committed_size, static_cast(max_batches_in_l2), n_transforms, static_cast(i), num_factors, diff --git a/src/portfft/dispatcher/subgroup_dispatcher.hpp b/src/portfft/dispatcher/subgroup_dispatcher.hpp index e511d280..32b57760 100644 --- a/src/portfft/dispatcher/subgroup_dispatcher.hpp +++ b/src/portfft/dispatcher/subgroup_dispatcher.hpp @@ -47,6 +47,7 @@ namespace detail { template IdxGlobal get_global_size_subgroup(IdxGlobal n_transforms, Idx factor_sg, Idx subgroup_size, Idx num_sgs_per_wg, Idx n_compute_units) { + LOG_FUNCTION_ENTRY(); Idx maximum_n_sgs = 2 * n_compute_units * 64; Idx maximum_n_wgs = maximum_n_sgs / num_sgs_per_wg; Idx wg_size = subgroup_size * num_sgs_per_wg; @@ -581,13 +582,16 @@ template template struct committed_descriptor::calculate_twiddles_struct::inner { static Scalar* execute(committed_descriptor& desc, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); const auto& kernel_data = dimension_data.kernels.at(0); Idx factor_wi = kernel_data.factors[0]; Idx factor_sg = kernel_data.factors[1]; + LOG_TRACE("Allocating global memory for twiddles for subgroup implementation. Allocation size", kernel_data.length * 2); Scalar* res = sycl::aligned_alloc_device( alignof(sycl::vec), kernel_data.length * 2, desc.queue); sycl::range<2> kernel_range({static_cast(factor_sg), static_cast(factor_wi)}); desc.queue.submit([&](sycl::handler& cgh) { + LOG_TRACE("Launching twiddle calculation kernel for subgroup implementation with global size", factor_sg, factor_wi); cgh.parallel_for(kernel_range, [=](sycl::item<2> it) { Idx n = static_cast(it.get_id(0)); Idx k = static_cast(it.get_id(1)); @@ -610,6 +614,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, Scalar scale_factor, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); constexpr detail::memory Mem = std::is_pointer_v ? detail::memory::USM : detail::memory::BUFFER; auto& kernel_data = dimension_data.kernels.at(0); Scalar* twiddles = kernel_data.twiddles_forward.get(); @@ -632,6 +637,7 @@ struct committed_descriptor::run_kernel_struct>( sycl::nd_range<1>{{global_size}, {static_cast(SubgroupSize * kernel_data.num_sgs_per_wg)}}, [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { @@ -657,7 +663,10 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t /*length*/, const std::vector& factors, detail::level /*level*/, Idx /*factor_num*/, Idx /*num_factors*/) { + LOG_FUNCTION_ENTRY(); + LOG_TRACE("SubgroupFactorWISpecConst:", factors[0]); in_bundle.template set_specialization_constant(factors[0]); + LOG_TRACE("SubgroupFactorSGSpecConst:", factors[1]); in_bundle.template set_specialization_constant(factors[1]); } }; @@ -668,6 +677,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& desc, std::size_t length, Idx used_sg_size, const std::vector& factors, Idx& num_sgs_per_wg) { + LOG_FUNCTION_ENTRY(); Idx dft_length = static_cast(length); Idx twiddle_bytes = 2 * dft_length * static_cast(sizeof(Scalar)); if constexpr (LayoutIn == detail::layout::BATCH_INTERLEAVED) { diff --git a/src/portfft/dispatcher/workgroup_dispatcher.hpp b/src/portfft/dispatcher/workgroup_dispatcher.hpp index 84f08dbb..6d6ae4f1 100644 --- a/src/portfft/dispatcher/workgroup_dispatcher.hpp +++ b/src/portfft/dispatcher/workgroup_dispatcher.hpp @@ -63,6 +63,7 @@ PORTFFT_INLINE constexpr Idx get_num_batches_in_local_mem_workgroup(Idx workgrou template IdxGlobal get_global_size_workgroup(IdxGlobal n_transforms, Idx subgroup_size, Idx num_sgs_per_wg, Idx n_compute_units) { + LOG_FUNCTION_ENTRY(); Idx maximum_n_sgs = 8 * n_compute_units * 64; Idx maximum_n_wgs = maximum_n_sgs / num_sgs_per_wg; Idx wg_size = subgroup_size * num_sgs_per_wg; @@ -282,6 +283,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, Scalar scale_factor, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); auto& kernel_data = dimension_data.kernels.at(0); Idx num_batches_in_local_mem = [=]() { if constexpr (LayoutIn == detail::layout::BATCH_INTERLEAVED) { @@ -312,8 +314,9 @@ struct committed_descriptor::run_kernel_struct>( - sycl::nd_range<1>{{global_size}, {static_cast(SubgroupSize * PORTFFT_SGS_IN_WG)}}, + sycl::nd_range<1>{{global_size}, {static_cast(SubgroupSize * kernel_data.num_sgs_per_wg)}}, [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { detail::global_data_struct global_data{ #ifdef PORTFFT_LOG @@ -337,7 +340,9 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t length, const std::vector& /*factors*/, detail::level /*level*/, Idx /*factor_num*/, Idx /*num_factors*/) { + LOG_FUNCTION_ENTRY(); const Idx length_idx = static_cast(length); + LOG_TRACE("SpecConstFftSize:", length_idx); in_bundle.template set_specialization_constant(length_idx); } }; @@ -348,6 +353,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& /*desc*/, std::size_t length, Idx used_sg_size, const std::vector& factors, Idx& /*num_sgs_per_wg*/) { + LOG_FUNCTION_ENTRY(); std::size_t n = static_cast(factors[0]) * static_cast(factors[1]); std::size_t m = static_cast(factors[2]) * static_cast(factors[3]); // working memory + twiddles for subgroup impl for the two sizes @@ -363,6 +369,7 @@ template template struct committed_descriptor::calculate_twiddles_struct::inner { static Scalar* execute(committed_descriptor& desc, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); const auto& kernel_data = dimension_data.kernels.at(0); Idx factor_wi_n = kernel_data.factors[0]; Idx factor_sg_n = kernel_data.factors[1]; @@ -372,10 +379,12 @@ struct committed_descriptor::calculate_twiddles_struct::inner(alignof(sycl::vec), static_cast(res_size), desc.queue); desc.queue.submit([&](sycl::handler& cgh) { + LOG_TRACE("Launching twiddle calculation kernel for factor 1 of workgroup implementation with global size", factor_sg_n, factor_wi_n); cgh.parallel_for(sycl::range<2>({static_cast(factor_sg_n), static_cast(factor_wi_n)}), [=](sycl::item<2> it) { Idx n = static_cast(it.get_id(0)); @@ -384,6 +393,7 @@ struct committed_descriptor::calculate_twiddles_struct::inner({static_cast(factor_sg_m), static_cast(factor_wi_m)}), [=](sycl::item<2> it) { Idx n = static_cast(it.get_id(0)); @@ -392,6 +402,7 @@ struct committed_descriptor::calculate_twiddles_struct::inner({static_cast(n), static_cast(factor_wi_m), static_cast(factor_sg_m)}), [=](sycl::item<3> it) { diff --git a/src/portfft/dispatcher/workitem_dispatcher.hpp b/src/portfft/dispatcher/workitem_dispatcher.hpp index 533fac4d..d9a2bfaa 100644 --- a/src/portfft/dispatcher/workitem_dispatcher.hpp +++ b/src/portfft/dispatcher/workitem_dispatcher.hpp @@ -45,6 +45,7 @@ namespace detail { */ template IdxGlobal get_global_size_workitem(IdxGlobal n_transforms, Idx subgroup_size, Idx num_sgs_per_wg, Idx n_compute_units) { + LOG_FUNCTION_ENTRY(); Idx maximum_n_sgs = 8 * n_compute_units * 64; Idx maximum_n_wgs = maximum_n_sgs / num_sgs_per_wg; Idx wg_size = subgroup_size * num_sgs_per_wg; @@ -281,6 +282,7 @@ struct committed_descriptor::run_kernel_struct& dependencies, IdxGlobal n_transforms, IdxGlobal input_offset, IdxGlobal output_offset, Scalar scale_factor, dimension_struct& dimension_data) { + LOG_FUNCTION_ENTRY(); constexpr detail::memory Mem = std::is_pointer_v ? detail::memory::USM : detail::memory::BUFFER; auto& kernel_data = dimension_data.kernels.at(0); std::size_t local_elements = @@ -299,6 +301,7 @@ struct committed_descriptor::run_kernel_struct>( sycl::nd_range<1>{{global_size}, {static_cast(SubgroupSize * kernel_data.num_sgs_per_wg)}}, [=](sycl::nd_item<1> it, sycl::kernel_handler kh) PORTFFT_REQD_SUBGROUP_SIZE(SubgroupSize) { @@ -324,7 +327,9 @@ struct committed_descriptor::set_spec_constants_struct::inner& in_bundle, std::size_t length, const std::vector& /*factors*/, detail::level /*level*/, Idx /*factor_num*/, Idx /*num_factors*/) { + LOG_FUNCTION_ENTRY(); const Idx length_idx = static_cast(length); + LOG_TRACE("SpecConstFftSize:", length_idx); in_bundle.template set_specialization_constant(length_idx); } }; @@ -335,6 +340,7 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in Dummy> { static std::size_t execute(committed_descriptor& desc, std::size_t length, Idx used_sg_size, const std::vector& /*factors*/, Idx& num_sgs_per_wg) { + LOG_FUNCTION_ENTRY(); Idx num_scalars_per_sg = detail::pad_local(2 * static_cast(length) * used_sg_size, 1); Idx max_n_sgs = desc.local_memory_size / static_cast(sizeof(Scalar)) / num_scalars_per_sg; num_sgs_per_wg = std::min(Idx(PORTFFT_SGS_IN_WG), std::max(Idx(1), max_n_sgs)); @@ -346,7 +352,10 @@ struct committed_descriptor::num_scalars_in_local_mem_struct::in template template struct committed_descriptor::calculate_twiddles_struct::inner { - static Scalar* execute(committed_descriptor& /*desc*/, dimension_struct& /*dimension_data*/) { return nullptr; } + static Scalar* execute(committed_descriptor& /*desc*/, dimension_struct& /*dimension_data*/) { + LOG_FUNCTION_ENTRY(); + return nullptr; + } }; } // namespace portfft diff --git a/src/portfft/enums.hpp b/src/portfft/enums.hpp index f26db54e..1f7cbb82 100644 --- a/src/portfft/enums.hpp +++ b/src/portfft/enums.hpp @@ -39,7 +39,7 @@ enum class direction { FORWARD, BACKWARD }; constexpr direction inv(direction dir) { return dir == direction::FORWARD ? direction::BACKWARD : direction::FORWARD; } namespace detail { -enum class pad { DO_PAD, DONT_PAD }; +enum class pad { DONT_PAD, DO_PAD }; enum class level { WORKITEM, SUBGROUP, WORKGROUP, GLOBAL }; @@ -66,9 +66,9 @@ enum class transfer_direction { GLOBAL_TO_LOCAL }; -enum class elementwise_multiply { APPLIED, NOT_APPLIED }; +enum class elementwise_multiply { NOT_APPLIED, APPLIED }; -enum class apply_scale_factor { APPLIED, NOT_APPLIED }; +enum class apply_scale_factor { NOT_APPLIED, APPLIED }; } // namespace detail } // namespace portfft diff --git a/src/portfft/utils.hpp b/src/portfft/utils.hpp index f9b086c9..bbe76146 100644 --- a/src/portfft/utils.hpp +++ b/src/portfft/utils.hpp @@ -26,6 +26,7 @@ #include #include +#include "common/logging.hpp" #include "defines.hpp" #include "enums.hpp" @@ -44,6 +45,7 @@ class transpose_kernel; template