From 804a03b76e6b1270327f3f6ddbe58b6ffba5d30e Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Thu, 14 Nov 2024 10:04:07 +0800 Subject: [PATCH] Enable `cow_input_*` test cases (#1067) Resolve issues in https://github.com/intel/torch-xpu-ops/issues/281 --------- Co-authored-by: ZhiweiYan-96 Co-authored-by: Yutao Xu --- .../sycl/AdaptiveAveragePooling2dKernels.cpp | 18 +- .../native/xpu/sycl/AveragePool2dKernels.cpp | 16 +- .../native/xpu/sycl/BucketizationKernels.cpp | 21 +- src/ATen/native/xpu/sycl/Col2ImKernel.cpp | 2 +- src/ATen/native/xpu/sycl/CumprodKernel.cpp | 2 +- src/ATen/native/xpu/sycl/CumsumKernel.cpp | 2 +- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 110 ++++------ src/ATen/native/xpu/sycl/DistanceKernels.cpp | 56 +++--- .../native/xpu/sycl/DistributionTemplates.h | 2 +- src/ATen/native/xpu/sycl/Dropout.cpp | 12 +- src/ATen/native/xpu/sycl/Embedding.cpp | 2 +- .../native/xpu/sycl/EmbeddingBackwardKernel.h | 18 +- src/ATen/native/xpu/sycl/EmbeddingBag.cpp | 188 +++++++++--------- src/ATen/native/xpu/sycl/EmbeddingBag.h | 16 +- src/ATen/native/xpu/sycl/GridSampler.cpp | 122 ++++++------ src/ATen/native/xpu/sycl/Im2ColKernel.cpp | 2 +- src/ATen/native/xpu/sycl/Indexing.cpp | 83 ++++---- src/ATen/native/xpu/sycl/Indexing.h | 16 +- src/ATen/native/xpu/sycl/LayerNormKernels.cpp | 93 ++++----- .../native/xpu/sycl/MaxUnpoolingKernels.cpp | 18 +- src/ATen/native/xpu/sycl/MemoryAccess.h | 7 +- src/ATen/native/xpu/sycl/NonzeroKernel.cpp | 10 +- src/ATen/native/xpu/sycl/Norm.h | 62 +++--- .../native/xpu/sycl/ReflectionPadKernels.cpp | 71 +++---- .../xpu/sycl/ReplicationPaddingKernels.cpp | 93 ++++----- src/ATen/native/xpu/sycl/ScanUtils.h | 8 +- .../native/xpu/sycl/SegmentReduceKernels.cpp | 40 ++-- src/ATen/native/xpu/sycl/Shape.cpp | 4 +- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 119 +++++------ src/ATen/native/xpu/sycl/Sorting.cpp | 21 +- src/ATen/native/xpu/sycl/SortingCommon.h | 2 +- src/ATen/native/xpu/sycl/SortingRadixSelect.h | 6 +- .../native/xpu/sycl/SummaryOpsKernels.cpp | 10 +- src/ATen/native/xpu/sycl/TensorModeKernel.cpp | 19 +- .../xpu/sycl/TensorTransformationsKernels.cpp | 6 +- .../native/xpu/sycl/TriangularOpsKernels.cpp | 6 +- src/ATen/native/xpu/sycl/UniqueKernels.cpp | 19 +- .../xpu/sycl/UpSampleBilinear2dKernels.cpp | 12 +- .../xpu/sycl/UpSampleNearest1dKernels.cpp | 8 +- .../xpu/sycl/UpSampleNearest2dKernels.cpp | 16 +- test/xpu/extended/skip_list_common.py | 8 +- 41 files changed, 671 insertions(+), 675 deletions(-) diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index 86fd7edfe..93c0f8972 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -61,7 +61,7 @@ struct AdaptiveAvgPool2dBwdKernelFunctor { } AdaptiveAvgPool2dBwdKernelFunctor( - PackedTensorAccessor64 gyacc, + PackedTensorAccessor64 gyacc, PackedTensorAccessor64 gxacc) : gyacc_(gyacc), gxacc_(gxacc) { ib_ = gxacc_.size(0); @@ -97,7 +97,7 @@ struct AdaptiveAvgPool2dBwdKernelFunctor { int64_t numel_; int global_range_; int local_range_; - PackedTensorAccessor64 gyacc_; + PackedTensorAccessor64 gyacc_; PackedTensorAccessor64 gxacc_; }; @@ -183,7 +183,7 @@ struct AdaptiveAvgPool2dBwdSLMKernelFunctor } AdaptiveAvgPool2dBwdSLMKernelFunctor( - PackedTensorAccessor64 gyacc, + PackedTensorAccessor64 gyacc, PackedTensorAccessor64 gxacc) : gyacc_(gyacc), gxacc_(gxacc) { ib_ = 
gxacc_.size(0); @@ -220,7 +220,7 @@ struct AdaptiveAvgPool2dBwdSLMKernelFunctor int64_t numel_; int local_range_; int global_range_; - PackedTensorAccessor64 gyacc_; + PackedTensorAccessor64 gyacc_; PackedTensorAccessor64 gxacc_; sycl_local_acc_t _oh0_cached_; sycl_local_acc_t _oh1_cached_; @@ -282,7 +282,7 @@ void adaptive_avg_pool2d_backward_kernel( "adaptive_avg_pool2d_backward_xpu", [&]() { using opmath_t = at::opmath_type; - auto gyacc = grad_output.packed_accessor64(); + auto gyacc = grad_output.packed_accessor64(); auto gxacc = grad_input.packed_accessor64(); int64_t ohw01_shared_size = @@ -375,7 +375,7 @@ struct AdaptiveAvgPool2dKernelFunctor { int ow, int64_t numel, int global_range, - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output) : ih_(ih), iw_(iw), @@ -397,13 +397,13 @@ struct AdaptiveAvgPool2dKernelFunctor { int ow_; int64_t numel_; int global_range_; - PackedTensorAccessor64 input_; + PackedTensorAccessor64 input_; PackedTensorAccessor64 output_; }; template void launch_adaptive_avg_pool2d_kernel( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output) { int ih = input.size(2); int iw = input.size(3); @@ -495,7 +495,7 @@ void adaptive_avg_pool2d_kernel( "adaptive_avg_pool2d_xpu", [&]() { using opmath_t = at::opmath_type; - auto iacc = input_.packed_accessor64(); + auto iacc = input_.packed_accessor64(); auto oacc = output.packed_accessor64(); if (is_smf_channels_last(output)) { launch_adaptive_avg_pool2d_kernel( diff --git a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp index e7a2c2e1f..3d1fb123e 100644 --- a/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp @@ -247,8 +247,8 @@ void launch_avg_pool2d_channels_last_kernel( const int divisor_override, const bool count_include_pad, const bool use_divisor) { - scalar_t* top_data = output.data_ptr(); - const scalar_t* bottom_data = input.data_ptr(); + scalar_t* top_data = output.mutable_data_ptr(); + const scalar_t* bottom_data = input.const_data_ptr(); auto& queue = at::xpu::getCurrentSYCLQueue(); const uint32_t group_size = static_cast(syclMaxWorkItemsPerEU()); @@ -295,8 +295,8 @@ void launch_avg_pool2d_kernel( const int divisor_override, const bool count_include_pad, const bool use_divisor) { - scalar_t* top_data = output.data_ptr(); - const scalar_t* bottom_data = input.data_ptr(); + scalar_t* top_data = output.mutable_data_ptr(); + const scalar_t* bottom_data = input.const_data_ptr(); auto& queue = at::xpu::getCurrentSYCLQueue(); const uint32_t group_size = static_cast(syclMaxWorkItemsPerEU()); @@ -552,8 +552,8 @@ void launch_avg_pool2d_backward_channels_last_kernel( const int divisor_override, bool count_include_pad, bool use_divisor) { - const scalar_t* top_data = grad_output.data_ptr(); - scalar_t* bottom_data = grad_input.data_ptr(); + const scalar_t* top_data = grad_output.const_data_ptr(); + scalar_t* bottom_data = grad_input.mutable_data_ptr(); auto& queue = at::xpu::getCurrentSYCLQueue(); const uint32_t group_size = static_cast(syclMaxWorkItemsPerEU()); @@ -603,8 +603,8 @@ void launch_avg_pool2d_backward_kernel( const int divisor_override, bool count_include_pad, bool use_divisor) { - const scalar_t* top_data = grad_output.data_ptr(); - scalar_t* bottom_data = grad_input.data_ptr(); + const scalar_t* top_data = grad_output.const_data_ptr(); + scalar_t* bottom_data = grad_input.mutable_data_ptr(); auto& queue = 
at::xpu::getCurrentSYCLQueue(); const uint32_t group_size = static_cast(syclMaxWorkItemsPerEU()); diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp index c5686babb..20ae3b53e 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -91,17 +91,15 @@ struct SearchsortedKernelFunctor { int64_t idim_in, int64_t idim_bd, const int64_t* data_st, - output_t* data_out, bool is_1d_boundaries, - input_t* data_in_data, - input_t* data_bd_data, + const input_t* data_in_data, + const input_t* data_bd_data, output_t* data_out_data) : right_(right), numel_in_(numel_in), idim_in_(idim_in), idim_bd_(idim_bd), data_st_(data_st), - data_out_(data_out), is_1d_boundaries_(is_1d_boundaries), data_in_data_(data_in_data), data_bd_data_(data_bd_data), @@ -113,10 +111,9 @@ struct SearchsortedKernelFunctor { int64_t idim_in_; int64_t idim_bd_; const int64_t* data_st_; - output_t* data_out_; bool is_1d_boundaries_; - input_t* data_in_data_; - input_t* data_bd_data_; + const input_t* data_in_data_; + const input_t* data_bd_data_; output_t* data_out_data_; }; template @@ -133,20 +130,18 @@ void searchsorted_template( int64_t idim_bd = boundaries.sizes().back(); const int64_t* data_st = - sorter.defined() ? sorter.data_ptr() : nullptr; - output_t* data_out = result.data_ptr(); + sorter.defined() ? sorter.const_data_ptr() : nullptr; bool is_1d_boundaries = boundaries.dim() == 1; - auto data_in_data = input.data_ptr(); - auto data_bd_data = boundaries.data_ptr(); - auto data_out_data = result.data_ptr(); + auto data_in_data = input.const_data_ptr(); + auto data_bd_data = boundaries.const_data_ptr(); + auto data_out_data = result.mutable_data_ptr(); SearchsortedKernelFunctor kfn( right, numel_in, idim_in, idim_bd, data_st, - data_out, is_1d_boundaries, data_in_data, data_bd_data, diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index d52a65fdf..5d67d0db1 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -236,7 +236,7 @@ void col2im_kernel( output_n = output.select(0, elt); col2im_kernel( - input_n.data_ptr(), + input_n.const_data_ptr(), n_output_plane, output_height, output_width, diff --git a/src/ATen/native/xpu/sycl/CumprodKernel.cpp b/src/ATen/native/xpu/sycl/CumprodKernel.cpp index 4f2930e30..209f27028 100644 --- a/src/ATen/native/xpu/sycl/CumprodKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumprodKernel.cpp @@ -17,7 +17,7 @@ void launch_cumprod_kernel( "cumprod_xpu", [&]() { scalar_t init = 1; - scan( + scan( result, self, dim, init, std::multiplies()); }); } diff --git a/src/ATen/native/xpu/sycl/CumsumKernel.cpp b/src/ATen/native/xpu/sycl/CumsumKernel.cpp index 281c12725..605692654 100644 --- a/src/ATen/native/xpu/sycl/CumsumKernel.cpp +++ b/src/ATen/native/xpu/sycl/CumsumKernel.cpp @@ -17,7 +17,7 @@ void launch_cumsum_kernel( "cumsum_xpu", [&]() { scalar_t init = 0; - scan( + scan( result, self, dim, init, std::plus()); }); } diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index ba0283b8b..e21c0160c 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -95,7 +95,7 @@ struct MaxPool2dKernelFunctor { MaxPool2dKernelFunctor( scalar_t* output, int64_t* indices, - scalar_t* input, + const scalar_t* input, int numPlane, int inputSizeH, int inputSizeW, @@ -133,7 +133,7 @@ 
struct MaxPool2dKernelFunctor { private: scalar_t* output_; int64_t* indices_; - scalar_t* input_; + const scalar_t* input_; int numPlane_; int inputSizeH_; int inputSizeW_; @@ -181,8 +181,8 @@ struct MaxPool2dBackwardKernelFunctor { } MaxPool2dBackwardKernelFunctor( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int numPlane, int gradInputSizeH, int gradInputSizeW, @@ -211,8 +211,8 @@ struct MaxPool2dBackwardKernelFunctor { private: scalar_t* gradInput_; - scalar_t* gradOutput_; - int64_t* indices_; + const scalar_t* gradOutput_; + const int64_t* indices_; int numPlane_; int gradInputSizeH_; int gradInputSizeW_; @@ -280,8 +280,8 @@ struct MaxPool2dBackwardDeterministicKernelFunctor { } MaxPool2dBackwardDeterministicKernelFunctor( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int numPlane, int gradInputSizeH, int gradInputSizeW, @@ -326,8 +326,8 @@ struct MaxPool2dBackwardDeterministicKernelFunctor { private: scalar_t* gradInput_; - scalar_t* gradOutput_; - int64_t* indices_; + const scalar_t* gradOutput_; + const int64_t* indices_; int numPlane_; int gradInputSizeH_; int gradInputSizeW_; @@ -353,7 +353,7 @@ template void launch_max_pool2d_kernel( scalar_t* output, int64_t* indices, - scalar_t* input, + const scalar_t* input, int numBatch, int numPlane, int inputSizeH, @@ -400,8 +400,8 @@ void launch_max_pool2d_kernel( template void launch_max_pool2d_backward_kernel( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int numBatch, int numPlane, int gradInputSizeH, @@ -498,31 +498,22 @@ void max_pool2d_with_indices_kernel( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, - const Tensor& output_, - const Tensor& indices_) { + const Tensor& output, + const Tensor& indices) { NoNamesGuard guard; - TensorArg output_arg{output_, "output", 1}; - TensorArg indices_arg{indices_, "indices", 2}; + TensorArg output_arg{output, "output", 1}; + TensorArg indices_arg{indices, "indices", 2}; TensorArg input_arg{input_, "input_", 3}; checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg}); - if (output_.numel() == 0) { + if (output.numel() == 0) { return; } auto smf = input_.suggest_memory_format(); - bool is_3d = input_.ndimension() == 3; - Tensor input, indices, output; - if (is_3d) { - input = input_.contiguous(); - indices = indices_.contiguous(); - output = output_.contiguous(); - } else { - input = input_.contiguous(smf); - indices = indices_.contiguous(smf); - output = output_.contiguous(smf); - } + + Tensor input = input_.contiguous(smf); const int kH = safe_downcast(kernel_size[0]); const int kW = kernel_size.size() == 1 @@ -556,9 +547,9 @@ void max_pool2d_with_indices_kernel( switch (smf) { case MemoryFormat::ChannelsLast: { launch_max_pool2d_kernel( - output.data_ptr(), - indices.data_ptr(), - input.data_ptr(), + output.mutable_data_ptr(), + indices.mutable_data_ptr(), + input.const_data_ptr(), nbatch, nInputPlane, inputHeight, @@ -577,9 +568,9 @@ void max_pool2d_with_indices_kernel( } case MemoryFormat::Contiguous: { launch_max_pool2d_kernel( - output.data_ptr(), - indices.data_ptr(), - input.data_ptr(), + output.mutable_data_ptr(), + indices.mutable_data_ptr(), + input.const_data_ptr(), nbatch, nInputPlane, inputHeight, @@ -602,20 +593,10 @@ void max_pool2d_with_indices_kernel( "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); } }); - - if ((is_3d && !indices_.is_contiguous()) || - (!is_3d && !indices_.is_contiguous(smf))) { - indices_.copy_(indices); - } - - if ((is_3d && !output_.is_contiguous()) || - (!is_3d && !output_.is_contiguous(smf))) { - output_.copy_(output); - } } void max_pool2d_with_indices_backward_kernel( - const Tensor& gradInput_, + const Tensor& gradInput, const Tensor& gradOutput_, const Tensor& input_, const Tensor& indices_, @@ -625,7 +606,7 @@ void max_pool2d_with_indices_backward_kernel( IntArrayRef dilation, bool ceil_mode) { NoNamesGuard guard; - TensorArg gradInput_arg{gradInput_, "gradInput", 1}; + TensorArg gradInput_arg{gradInput, "gradInput", 1}; TensorArg gradOutput_arg{gradOutput_, "gradOutput", 2}; TensorArg input_arg{input_, "input", 3}; TensorArg indices_arg{indices_, "indices", 4}; @@ -633,19 +614,11 @@ void max_pool2d_with_indices_backward_kernel( __func__, {gradInput_arg, gradOutput_arg, input_arg, indices_arg}); auto smf = input_.suggest_memory_format(); - bool is_3d = input_.ndimension() == 3; - Tensor input, gradOutput, indices, gradInput; - if (is_3d) { - input = input_.contiguous(); - gradOutput = gradOutput_.contiguous(); - indices = indices_.contiguous(); - gradInput = gradInput_.contiguous(); - } else { - input = input_.contiguous(smf); - gradOutput = gradOutput_.contiguous(smf); - indices = indices_.contiguous(smf); - gradInput = gradInput_.contiguous(smf); - } + Tensor input, gradOutput, indices; + + input = input_.contiguous(smf); + gradOutput = gradOutput_.contiguous(smf); + indices = indices_.contiguous(smf); gradInput.zero_(); const int kH = safe_downcast(kernel_size[0]); @@ -684,9 +657,9 @@ void max_pool2d_with_indices_backward_kernel( switch (smf) { case at::MemoryFormat::ChannelsLast: launch_max_pool2d_backward_kernel( - gradInput.data_ptr(), - gradOutput.data_ptr(), - indices.data_ptr(), + gradInput.mutable_data_ptr(), + gradOutput.const_data_ptr(), + indices.const_data_ptr(), nbatch, nInputPlane, inputHeight, @@ -704,9 +677,9 @@ void max_pool2d_with_indices_backward_kernel( break; case at::MemoryFormat::Contiguous: launch_max_pool2d_backward_kernel( - gradInput.data_ptr(), - gradOutput.data_ptr(), - indices.data_ptr(), + gradInput.mutable_data_ptr(), + gradOutput.const_data_ptr(), + indices.const_data_ptr(), nbatch, nInputPlane, inputHeight, @@ -728,11 +701,6 @@ void max_pool2d_with_indices_backward_kernel( "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); } }); - - if ((is_3d && !gradInput_.is_contiguous()) || - (!is_3d && !gradInput_.is_contiguous(smf))) { - gradInput_.copy_(gradInput); - } } } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 8990945e5..cd807b1be 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -263,10 +263,10 @@ struct CdistForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { const int64_t j = k % r2_; const size_t stride = item_id.get_local_range().size(); - scalar_t* start = x1_ptr + l * l1_size_ + i * m_; - scalar_t* end = start + m_; - scalar_t* a = start + local_id; - scalar_t* b = x2_ptr + l * l2_size_ + j * m_ + local_id; + const scalar_t* const start = x1_ptr + l * l1_size_ + i * m_; + const scalar_t* const end = start + m_; + const scalar_t* a = start + local_id; + const scalar_t* b = x2_ptr + l * l2_size_ + j * m_ + local_id; scalar_t agg = 0.0f; for (; a < end; a += stride, b += stride) { @@ -295,8 +295,8 @@ struct CdistForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { const int64_t l2_size, accscalar_t p_val, scalar_t* out_data, - scalar_t* x1_data, - scalar_t* x2_data, + const scalar_t* x1_data, + const scalar_t* x2_data, const int64_t wgroup_size) : r1_(r1), r2_(r2), @@ -319,8 +319,8 @@ struct CdistForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { const int64_t l2_size_; accscalar_t p_val_; scalar_t* out_data_; - scalar_t* x1_data_; - scalar_t* x2_data_; + const scalar_t* x1_data_; + const scalar_t* x2_data_; sycl_local_acc_t shared_; const int64_t wgroup_size_; }; @@ -341,9 +341,9 @@ static void launch_cdist_forward_kernel( auto wgroup_size = 32; using accscalar_t = acc_type_device; auto p_val = static_cast(p); - auto out_data = result.data_ptr(); - auto x1_data = x1.data_ptr(); - auto x2_data = x2.data_ptr(); + auto out_data = result.mutable_data_ptr(); + auto x1_data = x1.const_data_ptr(); + auto x2_data = x2.const_data_ptr(); CdistForwardKernelFunctor kfn( r1, @@ -493,10 +493,10 @@ struct CdistBackwardKernelImplFunctor { accscalar_t p_val, const int group_num_z, scalar_t* buff_data, - scalar_t* grad_data, - scalar_t* dist_data, - scalar_t* x1_data, - scalar_t* x2_data) + const scalar_t* grad_data, + const scalar_t* dist_data, + const scalar_t* x1_data, + const scalar_t* x2_data) : r1_(r1), r2_(r2), m_(m), @@ -529,10 +529,10 @@ struct CdistBackwardKernelImplFunctor { accscalar_t p_val_; const int group_num_z_; scalar_t* buff_data_; - scalar_t* grad_data_; - scalar_t* dist_data_; - scalar_t* x1_data_; - scalar_t* x2_data_; + const scalar_t* grad_data_; + const scalar_t* dist_data_; + const scalar_t* x1_data_; + const scalar_t* x2_data_; }; template @@ -569,11 +569,11 @@ static void cdist_backward_kernel_impl( sycl::range<3> local_range(group_size_x, group_size_y, 1); sycl::nd_range<3> work_load(global_range, local_range); - auto buff_data = buffer.data_ptr(); - auto grad_data = grad.data_ptr(); - auto dist_data = dist.data_ptr(); - auto x1_data = x1.data_ptr(); - auto x2_data = x2.data_ptr(); + auto buff_data = buffer.mutable_data_ptr(); + auto grad_data = grad.const_data_ptr(); + auto dist_data = dist.const_data_ptr(); + auto x1_data = x1.const_data_ptr(); + auto x2_data = x2.const_data_ptr(); CdistBackwardKernelImplFunctor kfn( r1, @@ -763,7 +763,7 @@ struct PdistKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { accscalar_t n2_val, accscalar_t n2_squared_minus_1_val, 
scalar_t* out_data, - scalar_t* in_data, + const scalar_t* in_data, const int64_t wgroup_size) : n_(n), m_(m), @@ -781,7 +781,7 @@ struct PdistKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { accscalar_t n2_val_; accscalar_t n2_squared_minus_1_val_; scalar_t* out_data_; - scalar_t* in_data_; + const scalar_t* in_data_; sycl_local_acc_t shared_; const int64_t wgroup_size_; }; @@ -808,8 +808,8 @@ static void pdist_kernel_impl( auto n2_val = static_cast(n2); auto n2_squared_minus_1_val = static_cast(n2_squared_minus_1); - auto out_data = result.data_ptr(); - auto in_data = self.data_ptr(); + auto out_data = result.mutable_data_ptr(); + auto in_data = self.const_data_ptr(); auto kfn = KernelClass( n, diff --git a/src/ATen/native/xpu/sycl/DistributionTemplates.h b/src/ATen/native/xpu/sycl/DistributionTemplates.h index 48b7659c2..4d7643943 100644 --- a/src/ATen/native/xpu/sycl/DistributionTemplates.h +++ b/src/ATen/native/xpu/sycl/DistributionTemplates.h @@ -812,7 +812,7 @@ void bernoulli_tensor_kernel( // element at each time. at::native::xpu::tensor_apply2< scalar_t, - prob_t, + const prob_t, 4, decltype(functor), /*threads_per_group=*/512>(ret, p, functor); diff --git a/src/ATen/native/xpu/sycl/Dropout.cpp b/src/ATen/native/xpu/sycl/Dropout.cpp index 280b31393..fb2a7c50b 100644 --- a/src/ATen/native/xpu/sycl/Dropout.cpp +++ b/src/ATen/native/xpu/sycl/Dropout.cpp @@ -107,7 +107,7 @@ struct FusedDropoutVecFunctor { } } FusedDropoutVecFunctor( - TensorInfo a, + TensorInfo a, TensorInfo b, TensorInfo c, IndexType total_elements, @@ -121,7 +121,7 @@ struct FusedDropoutVecFunctor { philox_args_(philox_args) {} private: - TensorInfo a_; + TensorInfo a_; TensorInfo b_; TensorInfo c_; IndexType total_elements_; @@ -165,7 +165,7 @@ struct FusedDropoutUnrollFunctor { if (li < total_elements_) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - IndexToOffset::get(li, a_); + IndexToOffset::get(li, a_); src[ii] = a_.data[aOffset]; } } @@ -182,7 +182,7 @@ struct FusedDropoutUnrollFunctor { } } FusedDropoutUnrollFunctor( - TensorInfo a, + TensorInfo a, TensorInfo b, TensorInfo c, IndexType total_elements, @@ -196,7 +196,7 @@ struct FusedDropoutUnrollFunctor { philox_args_(philox_args) {} private: - TensorInfo a_; + TensorInfo a_; TensorInfo b_; TensorInfo c_; IndexType total_elements_; @@ -278,7 +278,7 @@ inline void launcher( [&] { using accscalar_t = acc_type_device; accscalar_t pa = (accscalar_t)(p); - auto self_info = getTensorInfo(self); + auto self_info = getTensorInfo(self); auto ret_info = getTensorInfo(ret); auto mask_info = getTensorInfo(mask); self_info.collapseDims(); diff --git a/src/ATen/native/xpu/sycl/Embedding.cpp b/src/ATen/native/xpu/sycl/Embedding.cpp index f788c230f..f80b70e3e 100644 --- a/src/ATen/native/xpu/sycl/Embedding.cpp +++ b/src/ATen/native/xpu/sycl/Embedding.cpp @@ -53,7 +53,7 @@ Tensor embedding_dense_backward_kernel( sorted_indices.copy_(indices); pstl::itoa(orig_begin, orig_begin + num_indices, (index_t)0); pstl::sort( - indices.data_ptr(), + indices.const_data_ptr(), sorted_begin, orig_begin, num_indices, diff --git a/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h b/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h index b57673014..41687c4d3 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h @@ -173,11 +173,11 @@ struct ComputeGradWeightBagsKernelFunctor { int64_t per_sample_weights_stride, acc_type_device* grad_weight_per_segment_data, index_t* indices_data, - 
scalar_t* gradoutput_data, + const scalar_t* gradoutput_data, index_t* offset2bag_data, index_t* count_data, index_t* bag_size_data, - scalar_t* per_sample_weights_data, + const scalar_t* per_sample_weights_data, index_t* segment_offsets_data) : numel_(numel), stride_(stride), @@ -207,11 +207,11 @@ struct ComputeGradWeightBagsKernelFunctor { int64_t per_sample_weights_stride_; acc_type_device* grad_weight_per_segment_data_; index_t* indices_data_; - scalar_t* gradoutput_data_; + const scalar_t* gradoutput_data_; index_t* offset2bag_data_; index_t* count_data_; index_t* bag_size_data_; - scalar_t* per_sample_weights_data_; + const scalar_t* per_sample_weights_data_; index_t* segment_offsets_data_; }; @@ -238,7 +238,7 @@ void compute_grad_weight_bags( grad_weight_per_segment .template data_ptr>(); auto indices_data = indices.template data_ptr(); - auto gradOutput_data = gradOutput.data_ptr(); + auto gradOutput_data = gradOutput.const_data_ptr(); auto offset2bag_data = offset2bag.data_ptr(); auto count_data = count_defined ? count.data_ptr() @@ -246,7 +246,7 @@ void compute_grad_weight_bags( // buffer. auto bag_size_data = bag_size.data_ptr(); auto per_sample_weights_data = per_sample_weight_defined - ? per_sample_weights.data_ptr() + ? per_sample_weights.const_data_ptr() : gradOutput_data; // ise the gradOutput_data handler as the dummy // buffer. auto segment_offsets_data = segment_offsets.data_ptr(); @@ -323,7 +323,7 @@ struct ComputeGradWeightKernelFunctor { bool count_defined, acc_type_device* grad_weight_per_segment_data, index_t* indices_data, - scalar_t* grad_output_data, + const scalar_t* grad_output_data, index_t* count_data, index_t* segment_offsets_data) : numel_(numel), @@ -345,7 +345,7 @@ struct ComputeGradWeightKernelFunctor { bool count_defined_; acc_type_device* grad_weight_per_segment_data_; index_t* indices_data_; - scalar_t* grad_output_data_; + const scalar_t* grad_output_data_; index_t* count_data_; index_t* segment_offsets_data_; }; @@ -365,7 +365,7 @@ void compute_grad_weight( auto grad_weight_per_segment_data = grad_weight_per_segment.data_ptr>(); auto indices_data = indices.data_ptr(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto count_data = count_defined ? count.data_ptr() : indices_data; // use the indices_data handler as the dummy buffer. 
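
Note on the recurring pattern: the hunks in this patch repeatedly switch tensors that a kernel only reads to `const_data_ptr<T>()` (or to a `TensorInfo` / `PackedTensorAccessor64` with a const element type), while tensors the kernel writes use `mutable_data_ptr<T>()`. Reading through the const accessor avoids materializing copy-on-write (COW) inputs, which is what the `cow_input_*` tests exercise. Below is a minimal host-side sketch of that accessor pattern; the function name, the element-wise loop, and the `alpha` parameter are illustrative only and not part of this patch — only the `const_data_ptr` / `mutable_data_ptr` accessors are the actual ATen calls the diff switches to.

```cpp
#include <ATen/ATen.h>

// Illustrative sketch (not from this patch): read-only operands go through
// const_data_ptr<T>(), which leaves a copy-on-write input unmaterialized,
// while the output uses mutable_data_ptr<T>() because it is written.
template <typename scalar_t>
void scale_into_sketch(const at::Tensor& src, at::Tensor& dst, scalar_t alpha) {
  const scalar_t* in = src.const_data_ptr<scalar_t>(); // read-only access
  scalar_t* out = dst.mutable_data_ptr<scalar_t>();    // mutating access
  for (int64_t i = 0; i < src.numel(); ++i) {
    out[i] = alpha * in[i];
  }
}
```
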
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 3d1f8a043..00bbe017e 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -37,13 +37,13 @@ template < int vec_size> void embedding_bag( scalar_t* const output, - scalar_t* const weights, - index_t* const index, - index_t* const offset, + const scalar_t* const weights, + const index_t* const index, + const index_t* const offset, index_t* const offset2bag, index_t* const bag_size, index_t* const max_index, - scalar_t* const per_sample_weights, + const scalar_t* const per_sample_weights, int64_t index_size, int64_t bag_num, int64_t vec_len, @@ -63,7 +63,7 @@ void embedding_bag( vec_idx_t>; vec_t* o_vec = reinterpret_cast(output); - vec_t* w_vec = reinterpret_cast(weights); + const vec_t* w_vec = reinterpret_cast(weights); vec_idx_t* max_idx_vec = reinterpret_cast(max_index); vec_len = vec_len / vec_size; @@ -92,73 +92,75 @@ void embedding_bag( cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); } -#define EMBBAG_KERNEL_ACC( \ - scalar_t, \ - accscalar_t, \ - index_t, \ - mode, \ - vec_size, \ - output, \ - weight, \ - input, \ - offset, \ - offset2bag, \ - bag_size, \ - max_indices, \ - per_sample_weights, \ - index_len, \ - bag_num, \ - vec_len, \ - padding_idx, \ - ignore_offsets) \ - embedding_bag( \ - output.data_ptr(), \ - weight.data_ptr(), \ - indices.data_ptr(), \ - offsets.data_ptr(), \ - offset2bag.data_ptr(), \ - bag_size.data_ptr(), \ - max_indices.data_ptr(), \ - per_sample_weights.defined() ? per_sample_weights.data_ptr() \ - : nullptr, \ - index_size, \ - bag_num, \ - vec_len, \ - padding_idx, \ +#define EMBBAG_KERNEL_ACC( \ + scalar_t, \ + accscalar_t, \ + index_t, \ + mode, \ + vec_size, \ + output, \ + weight, \ + input, \ + offset, \ + offset2bag, \ + bag_size, \ + max_indices, \ + per_sample_weights, \ + index_len, \ + bag_num, \ + vec_len, \ + padding_idx, \ + ignore_offsets) \ + embedding_bag( \ + output.mutable_data_ptr(), \ + weight.const_data_ptr(), \ + indices.const_data_ptr(), \ + offsets.const_data_ptr(), \ + offset2bag.mutable_data_ptr(), \ + bag_size.mutable_data_ptr(), \ + max_indices.mutable_data_ptr(), \ + per_sample_weights.defined() \ + ? per_sample_weights.const_data_ptr() \ + : nullptr, \ + index_size, \ + bag_num, \ + vec_len, \ + padding_idx, \ ignore_offsets) -#define EMBBAG_KERNEL_NO_ACC( \ - scalar_t, \ - index_t, \ - mode, \ - vec_size, \ - output, \ - weight, \ - input, \ - offset, \ - offset2bag, \ - bag_size, \ - max_indices, \ - per_sample_weights, \ - index_len, \ - bag_num, \ - vec_len, \ - padding_idx, \ - ignore_offsets) \ - embedding_bag( \ - output.data_ptr(), \ - weight.data_ptr(), \ - indices.data_ptr(), \ - offsets.data_ptr(), \ - offset2bag.data_ptr(), \ - bag_size.data_ptr(), \ - max_indices.data_ptr(), \ - per_sample_weights.defined() ? 
per_sample_weights.data_ptr() \ - : nullptr, \ - index_size, \ - bag_num, \ - vec_len, \ - padding_idx, \ +#define EMBBAG_KERNEL_NO_ACC( \ + scalar_t, \ + index_t, \ + mode, \ + vec_size, \ + output, \ + weight, \ + input, \ + offset, \ + offset2bag, \ + bag_size, \ + max_indices, \ + per_sample_weights, \ + index_len, \ + bag_num, \ + vec_len, \ + padding_idx, \ + ignore_offsets) \ + embedding_bag( \ + output.mutable_data_ptr(), \ + weight.const_data_ptr(), \ + indices.const_data_ptr(), \ + offsets.const_data_ptr(), \ + offset2bag.mutable_data_ptr(), \ + bag_size.mutable_data_ptr(), \ + max_indices.mutable_data_ptr(), \ + per_sample_weights.defined() \ + ? per_sample_weights.const_data_ptr() \ + : nullptr, \ + index_size, \ + bag_num, \ + vec_len, \ + padding_idx, \ ignore_offsets) void embedding_bag_sum_template( @@ -206,7 +208,7 @@ void embedding_bag_sum_template( indices.scalar_type(), "embedding_bag_sum_xpu", [&] { using accscalar_t = at::acc_type_device; int vec_size = memory::can_vectorize_up_to( - (char*)weights.data_ptr()); + (char*)weights.const_data_ptr()); vec_size = vec_len % vec_size == 0 ? vec_size : 1; switch (vec_size) { case 8: @@ -272,7 +274,7 @@ void embedding_bag_mean_template( indices.scalar_type(), "embedding_bag_mean_xpu", [&] { using accscalar_t = at::acc_type_device; int vec_size = memory::can_vectorize_up_to( - (char*)weights.data_ptr()); + (char*)weights.const_data_ptr()); vec_size = vec_len % vec_size == 0 ? vec_size : 1; switch (vec_size) { case 8: @@ -337,7 +339,7 @@ void embedding_bag_max_template( indices.scalar_type(), "embedding_bag_max_xpu", [&] { // using accscalar_t = at::acc_type_device; int vec_size = memory::can_vectorize_up_to( - (char*)weights.data_ptr()); + (char*)weights.const_data_ptr()); vec_size = vec_len % vec_size == 0 ? 
vec_size : 1; switch (vec_size) { case 8: @@ -389,22 +391,26 @@ Tensor embedding_bag_backward_xpu_sum_avg( // int64_t stride = grad_weight.stride(0); auto sorted_indices = at::empty_like(indices); - auto sorted_begin = sorted_indices.data_ptr(); + auto sorted_begin = sorted_indices.mutable_data_ptr(); auto orig_indices = at::empty_like(indices); - auto orig_begin = orig_indices.data_ptr(); + auto orig_begin = orig_indices.mutable_data_ptr(); // directly { sorted_indices.copy_(indices); pstl::itoa(orig_begin, orig_begin + numel, (index_t)0); pstl::sort( - indices.data_ptr(), sorted_begin, orig_begin, numel, false); + indices.const_data_ptr(), + sorted_begin, + orig_begin, + numel, + false); } Tensor count; if (scale_grad_by_freq) { count = at::empty_like(sorted_indices); - index_t* count_begin = count.data_ptr(); + index_t* count_begin = count.mutable_data_ptr(); // Take the maximum of each count per unique key: // sorted: 2 5 5 5 7 7 8 9 9 // count: 1 3 3 3 2 2 1 2 2 @@ -456,8 +462,8 @@ struct EmbeddingBagAccGradParametersKernelMaxFunctor { } } EmbeddingBagAccGradParametersKernelMaxFunctor( - index_t* max_indices_data, - scalar_t* gradOutput_data, + const index_t* max_indices_data, + const scalar_t* gradOutput_data, scalar_t* gradWeight_data, int64_t stride, int64_t chunksPerBag, @@ -470,8 +476,8 @@ struct EmbeddingBagAccGradParametersKernelMaxFunctor { numChunks_(numChunks) {} private: - index_t* max_indices_data_; - scalar_t* gradOutput_data_; + const index_t* max_indices_data_; + const scalar_t* gradOutput_data_; scalar_t* gradWeight_data_; int64_t stride_; int64_t chunksPerBag_; @@ -480,8 +486,8 @@ struct EmbeddingBagAccGradParametersKernelMaxFunctor { template void EmbeddingBag_accGradParametersKernel_max( - index_t* max_indices, - scalar_t* gradOutput, + const index_t* max_indices, + const scalar_t* gradOutput, scalar_t* gradWeight, int64_t stride, int64_t numBags) { @@ -519,9 +525,9 @@ Tensor embedding_bag_backward_xpu_max( int64_t numBags = grad.size(0); EmbeddingBag_accGradParametersKernel_max( - max_indices.data_ptr(), - grad.data_ptr(), - grad_weight.data_ptr(), + max_indices.const_data_ptr(), + grad.const_data_ptr(), + grad_weight.mutable_data_ptr(), stride, numBags); @@ -764,17 +770,17 @@ Tensor _embedding_bag_per_sample_weights_backward_kernel( _embedding_bag_per_sample_weights_backward_impl< scalar_t, index_t>( - grad.data_ptr(), + grad.const_data_ptr(), grad.stride(0), grad.stride(1), - weight.data_ptr(), + weight.const_data_ptr(), weight.stride(0), weight.stride(1), - indices.data_ptr(), - offset2bag.data_ptr(), + indices.const_data_ptr(), + offset2bag.const_data_ptr(), num_samples, embedding_features, - output.data_ptr(), + output.mutable_data_ptr(), padding_idx); }); }); diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.h b/src/ATen/native/xpu/sycl/EmbeddingBag.h index aa36ff665..08caf60b6 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.h @@ -134,19 +134,19 @@ struct EmbeddingBagKernelFunctor { } while (cfg_.next(item, desc)); } EmbeddingBagKernelFunctor( - index_t* const index, - index_t* const offset, + const index_t* const index, + const index_t* const offset, index_t* const offset2bag, index_t* const bag_size, index_t* const max_index, - scalar_t* const per_sample_weights, + const scalar_t* const per_sample_weights, int64_t index_size, int64_t bag_num, int64_t vec_len, index_t padding_idx, bool ignore_offsets, vec_t* o_vec, - vec_t* w_vec, + const vec_t* w_vec, vec_idx_t* max_idx_vec, BatchKernelConfig cfg, index_t 
fixing_bag_size) @@ -168,19 +168,19 @@ struct EmbeddingBagKernelFunctor { fixing_bag_size_(fixing_bag_size) {} private: - index_t* const index_; - index_t* const offset_; + const index_t* const index_; + const index_t* const offset_; index_t* const offset2bag_; index_t* const bag_size_; index_t* const max_index_; - scalar_t* const per_sample_weights_; + const scalar_t* const per_sample_weights_; int64_t index_size_; int64_t bag_num_; int64_t vec_len_; index_t padding_idx_; bool ignore_offsets_; vec_t* o_vec_; - vec_t* w_vec_; + const vec_t* w_vec_; vec_idx_t* max_idx_vec_; BatchKernelConfig cfg_; index_t fixing_bag_size_; diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index c2b51d581..1841dad64 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -170,8 +170,8 @@ struct GridSampler2dKernelFunctor { } GridSampler2dKernelFunctor( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -220,8 +220,8 @@ struct GridSampler2dKernelFunctor { private: const index_t nthreads_; - TensorInfo input_; - TensorInfo grid_; + TensorInfo input_; + TensorInfo grid_; TensorInfo output_; const GridSamplerInterpolation interpolation_mode_; const GridSamplerPadding padding_mode_; @@ -248,8 +248,8 @@ struct GridSampler2dKernelFunctor { template void grid_sampler_2d_forward_template( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -334,8 +334,8 @@ Tensor grid_sampler_2d_kernel( canUse32BitIndexMath(output)) { grid_sampler_2d_forward_template( static_cast(count), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -343,8 +343,8 @@ Tensor grid_sampler_2d_kernel( } else { grid_sampler_2d_forward_template( count, - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -395,15 +395,15 @@ struct GridSampler2dBackwardKernelFunctor { scalar_t se = (ix - ix_nw) * (iy - iy_nw); scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t* gOut_ptr_NCHW = + const scalar_t* gOut_ptr_NCHW = grad_output_.data + n * gOut_sN_ + h * gOut_sH_ + w * gOut_sW_; index_t NC_offset = n * gInp_sN_; - scalar_t* inp_ptr_NC = input_.data + n * inp_sN_; + const scalar_t* inp_ptr_NC = input_.data + n * inp_sN_; for (index_t c = 0; c < C_; ++c, inp_ptr_NC += inp_sC_, NC_offset += gInp_sC_, gOut_ptr_NCHW += gOut_sC_) { - scalar_t gOut = *gOut_ptr_NCHW; + const scalar_t gOut = *gOut_ptr_NCHW; if (input_requires_grad_) { // calculate and set grad_input @@ -485,7 +485,7 @@ struct GridSampler2dBackwardKernelFunctor { index_t iy_nearest = static_cast(std::nearbyint(iy)); // assign nearest neighor pixel value to output pixel - scalar_t* gOut_ptr_NCHW = + const scalar_t* gOut_ptr_NCHW = grad_output_.data + n * gOut_sN_ + h * gOut_sH_ + w * gOut_sW_; index_t NC_offset = n * gInp_sN_; for (index_t c = 0; c < C_; @@ -536,16 +536,16 @@ struct GridSampler2dBackwardKernelFunctor { scalar_t gix = static_cast(0); scalar_t giy = static_cast(0); - scalar_t* gOut_ptr_NCHW = + const scalar_t* 
gOut_ptr_NCHW = grad_output_.data + n * gOut_sN_ + h * gOut_sH_ + w * gOut_sW_; index_t NC_offset = n * gInp_sN_; - scalar_t* inp_ptr_NC = input_.data + n * inp_sN_; + const scalar_t* inp_ptr_NC = input_.data + n * inp_sN_; for (index_t c = 0; c < C_; ++c, gOut_ptr_NCHW += gOut_sC_, NC_offset += gInp_sC_, inp_ptr_NC += inp_sC_) { - scalar_t gOut = *gOut_ptr_NCHW; + const scalar_t gOut = *gOut_ptr_NCHW; #pragma unroll 4 for (index_t i = 0; i < 4; ++i) { @@ -591,9 +591,9 @@ struct GridSampler2dBackwardKernelFunctor { } GridSampler2dBackwardKernelFunctor( const index_t nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, TensorInfo grad_grid, const GridSamplerInterpolation interpolation_mode, @@ -657,9 +657,9 @@ struct GridSampler2dBackwardKernelFunctor { private: const index_t nthreads_; - TensorInfo grad_output_; - TensorInfo input_; - TensorInfo grid_; + TensorInfo grad_output_; + TensorInfo input_; + TensorInfo grid_; TensorInfo grad_input_; TensorInfo grad_grid_; const GridSamplerInterpolation interpolation_mode_; @@ -693,9 +693,9 @@ struct GridSampler2dBackwardKernelFunctor { template void grid_sampler_2d_backward_template( const index_t nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, // initialized to zeros // (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty @@ -809,9 +809,9 @@ void grid_sampler_2d_backward_kernel( canUse32BitIndexMath(grad_output)) { grid_sampler_2d_backward_template( static_cast(count), - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), @@ -822,9 +822,9 @@ void grid_sampler_2d_backward_kernel( } else { grid_sampler_2d_backward_template( count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? 
getTensorInfo(grad_input) : TensorInfo(), @@ -999,8 +999,8 @@ struct GridSampler3dKernelFunctor { } GridSampler3dKernelFunctor( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -1059,8 +1059,8 @@ struct GridSampler3dKernelFunctor { private: const index_t nthreads_; - TensorInfo input_; - TensorInfo grid_; + TensorInfo input_; + TensorInfo grid_; TensorInfo output_; const GridSamplerInterpolation interpolation_mode_; const GridSamplerPadding padding_mode_; @@ -1092,8 +1092,8 @@ struct GridSampler3dKernelFunctor { template void grid_sampler_3d_forward_template( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -1191,8 +1191,8 @@ Tensor grid_sampler_3d_kernel( canUse32BitIndexMath(output)) { grid_sampler_3d_forward_template( static_cast(count), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -1200,8 +1200,8 @@ Tensor grid_sampler_3d_kernel( } else { grid_sampler_3d_forward_template( count, - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -1288,16 +1288,16 @@ struct GridSampler3dBackwardKernelFunctor { scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t* gOut_ptr_NCDHW = grad_output_.data + n * gOut_sN_ + + const scalar_t* gOut_ptr_NCDHW = grad_output_.data + n * gOut_sN_ + d * gOut_sD_ + h * gOut_sH_ + w * gOut_sW_; index_t NC_offset = n * gInp_sN_; - scalar_t* inp_ptr_NC = input_.data + n * inp_sN_; + const scalar_t* inp_ptr_NC = input_.data + n * inp_sN_; // calculate bilinear weighted pixel value and set output pixel for (index_t c = 0; c < C_; ++c, gOut_ptr_NCDHW += gOut_sC_, NC_offset += gInp_sC_, inp_ptr_NC += inp_sC_) { - scalar_t gOut = *gOut_ptr_NCDHW; + const scalar_t gOut = *gOut_ptr_NCDHW; if (input_requires_grad_) { // calculate and set grad_input_ @@ -1482,7 +1482,7 @@ struct GridSampler3dBackwardKernelFunctor { auto iz_nearest = static_cast(std::round(iz)); // assign nearest neighor pixel value to output pixel - scalar_t* gOut_ptr_NCDHW = grad_output_.data + n * gOut_sN_ + + const scalar_t* gOut_ptr_NCDHW = grad_output_.data + n * gOut_sN_ + d * gOut_sD_ + h * gOut_sH_ + w * gOut_sW_; index_t NC_offset = n * gInp_sN_; for (index_t c = 0; c < C_; @@ -1517,9 +1517,9 @@ struct GridSampler3dBackwardKernelFunctor { } GridSampler3dBackwardKernelFunctor( const index_t nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, TensorInfo grad_grid, const GridSamplerInterpolation interpolation_mode, @@ -1595,9 +1595,9 @@ struct GridSampler3dBackwardKernelFunctor { private: const index_t nthreads_; - TensorInfo grad_output_; - TensorInfo input_; - TensorInfo grid_; + TensorInfo grad_output_; + TensorInfo input_; + TensorInfo grid_; TensorInfo grad_input_; TensorInfo grad_grid_; const GridSamplerInterpolation interpolation_mode_; @@ -1637,9 +1637,9 @@ struct GridSampler3dBackwardKernelFunctor { template void grid_sampler_3d_backward_template( const index_t nthreads, - 
TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, // initialized to zeros // (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty @@ -1768,9 +1768,9 @@ void grid_sampler_3d_backward_kernel( canUse32BitIndexMath(grad_output)) { grid_sampler_3d_backward_template( static_cast(count), - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), @@ -1781,9 +1781,9 @@ void grid_sampler_3d_backward_kernel( } else { grid_sampler_3d_backward_template( count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp index 9d60bfb25..c37e22983 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp +++ b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp @@ -221,7 +221,7 @@ void im2col_kernel( output_n = output.select(0, elt); im2col_kernel( - input_n.data_ptr(), + input_n.const_data_ptr(), n_input_plane, input_height, input_width, diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index 22d10bd23..d429ecfbe 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -56,7 +56,7 @@ class IndexSelectScalarFunctor { public: void operator()( ValType* dst, - ValType* src, + const ValType* src, int64_t dst_off, int64_t src_off, int64_t idx, @@ -75,7 +75,7 @@ static inline void _index_select_kernel( DstInfo& dst_info, IdxInfo& index_info, int64_t dim) { - using scalar_t = typename SrcInfo::scalar_t; + using scalar_t = typename DstInfo::scalar_t; using IdxConfig = IndexKernelConfig< SrcInfo, DstInfo, @@ -157,8 +157,8 @@ void index_select_kernel( "index_select(): Source and result must have the same scalar type"); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "index_select", [&] { - TensorInfo index_info = - tensorInfoIfScalar(getTensorInfo(indices)); + TensorInfo index_info = + tensorInfoIfScalar(getTensorInfo(indices)); index_info.collapseDims(); auto new_size = src.sizes().vec(); @@ -180,13 +180,13 @@ void index_select_kernel( AT_WRAP([&] { TensorInfo dst_info = tensorInfoIfScalar(getTensorInfo(dst)); - TensorInfo src_info = tensorInfoIfScalar( - getTensorInfo(src.contiguous())); + TensorInfo src_info = tensorInfoIfScalar( + getTensorInfo(src.contiguous())); int new_indexing_dim = src_info.collapseDims(dim); - using SrcInfo = TensorInfo; + using SrcInfo = TensorInfo; using DstInfo = TensorInfo; - using IdxInfo = TensorInfo; + using IdxInfo = TensorInfo; // Improve efficiency of generated native instructions for contiguous. 
// See comm/TensorInfo.h @@ -302,7 +302,7 @@ template struct IndexAddScalarFunctor { void operator()( ValType* dst, - ValType* src, + const ValType* src, int64_t dst_off, int64_t src_off, int64_t idx, @@ -315,7 +315,7 @@ template <> struct IndexAddScalarFunctor { void operator()( bool* dst, - bool* src, + const bool* src, int64_t dst_off, int64_t src_off, int64_t idx, @@ -393,12 +393,12 @@ void index_add_kernel( "index_add_xpu", [&] { AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_xpu", [&]() { - TensorInfo index_info = - getTensorInfo(index); + TensorInfo index_info = + getTensorInfo(index); index_info.collapseDims(); - TensorInfo src_info = - getTensorInfo(source_); + TensorInfo src_info = + getTensorInfo(source_); TensorInfo dst_info = getTensorInfo(self_); @@ -428,7 +428,7 @@ template struct IndexFillScalarFunctor { void operator()( ValType* dst, - ValType* src, + const ValType* src, int64_t dst_off, int64_t src_off, int64_t idx, @@ -479,8 +479,8 @@ void index_fill_kernel( self_restrided.scalar_type(), "index_fill_xpu", [&] { - TensorInfo index_info = - getTensorInfo(index); + TensorInfo index_info = + getTensorInfo(index); index_info.collapseDims(); TensorInfo dst_info = @@ -488,7 +488,7 @@ void index_fill_kernel( int new_indexing_dim = dst_info.collapseDims(dim); // No used in index kernel frame for index_fill. - auto src_info = TensorInfo(); + auto src_info = TensorInfo(); using IdxConfig = IndexKernelConfig< decltype(src_info), @@ -511,18 +511,25 @@ void index_fill_kernel( template struct IndexPutAccumulateFunctor { - void operator()(char* out_data, char* in_data, int64_t offset) const { - sycl_global_ptr out_ptr = - sycl_global_ptr((scalar_t*)(out_data + offset)); - auto in = *(scalar_t*)in_data; + void operator()( + char* const out_data, + const char* const in_data, + int64_t offset) const { + sycl_global_ptr out_ptr = sycl_global_ptr( + reinterpret_cast(out_data + offset)); + auto in = *reinterpret_cast(in_data); atomicAdd(out_ptr, in); } }; template struct IndexPutFunctor { - void operator()(char* out_data, char* in_data, int64_t offset) const { - *(scalar_t*)(out_data + offset) = *(scalar_t*)in_data; + void operator()( + char* const out_data, + const char* const in_data, + int64_t offset) const { + *reinterpret_cast(out_data + offset) = + *reinterpret_cast(in_data); } }; @@ -627,7 +634,7 @@ void index_put_deterministic_kernel( orig_indices.data_ptr() + linearIndex.numel(), (int64_t)0); pstl::sort( - linearIndex.data_ptr(), + linearIndex.const_data_ptr(), sorted_indices.data_ptr(), orig_indices.data_ptr(), linearIndex.numel(), @@ -647,10 +654,10 @@ void index_put_deterministic_kernel( "index_put_deterministic_kernel", [&] { launch_index_put_deterministic_kernel( - sorted_indices.data_ptr(), - orig_indices.data_ptr(), - expandedValue.data_ptr(), - src_.data_ptr(), + sorted_indices.mutable_data_ptr(), + orig_indices.mutable_data_ptr(), + expandedValue.const_data_ptr(), + src_.mutable_data_ptr(), num_indices, sliceSize, strideBefore, @@ -758,7 +765,7 @@ class IndexCopyScalarFunctor { public: void operator()( ValType* dst, - ValType* src, + const ValType* src, int64_t dst_off, int64_t src_off, int64_t idx, @@ -773,7 +780,7 @@ static inline void _index_copy_kernel( DstInfo& dst_info, IdxInfo& index_info, int64_t dim) { - using scalar_t = typename SrcInfo::scalar_t; + using scalar_t = typename DstInfo::scalar_t; using IdxConfig = IndexKernelConfig< SrcInfo, DstInfo, @@ -823,12 +830,12 @@ static inline void index_copy_impl( return; } - TensorInfo indices_info = - 
getTensorInfo(indices); + TensorInfo indices_info = + getTensorInfo(indices); indices_info.collapseDims(); - TensorInfo src_info = - getTensorInfo(source); + TensorInfo src_info = + getTensorInfo(source); TensorInfo dst_info = getTensorInfo(dst); @@ -1056,10 +1063,10 @@ struct TakeFunctor { void operator()(scalar_t& iterated, const index_t offset) const { iterated = indexed_ptr_[offset]; } - TakeFunctor(scalar_t* indexed_ptr) : indexed_ptr_(indexed_ptr) {} + TakeFunctor(const scalar_t* indexed_ptr) : indexed_ptr_(indexed_ptr) {} private: - scalar_t* indexed_ptr_; + const scalar_t* indexed_ptr_; }; void take_kernel(TensorIterator& iter, const TensorBase& input) { @@ -1074,7 +1081,7 @@ void take_kernel(TensorIterator& iter, const TensorBase& input) { canUse32BitIndexMath(input) ? ScalarType::Int : ScalarType::Long, "take_xpu_index", [&] { - scalar_t* indexed_ptr = input.template data_ptr(); + const scalar_t* indexed_ptr = input.template const_data_ptr(); TakeFunctor f(indexed_ptr); take_put_kernel_template(iter, input, f); }); diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h index 4343a7bfe..d142b2901 100644 --- a/src/ATen/native/xpu/sycl/Indexing.h +++ b/src/ATen/native/xpu/sycl/Indexing.h @@ -28,7 +28,7 @@ TensorInfo tensorInfoIfScalar(TensorInfo ti) { template class IndexKernelConfig : public BatchKernelConfig { public: - using ValType = typename SrcInfo::scalar_t; + using ValType = typename DstInfo::scalar_t; using IdxType = typename IdxInfo::scalar_t; IndexKernelConfig() = delete; @@ -325,17 +325,17 @@ class IndexKernel { cfg_.dinfo_, IndexToOffset::NON_STRICT_CONTIGUOUS); if (cfg_.sinfo_.data != nullptr) { - src_off = IndexToOffset::get( + src_off = IndexToOffset::get( glb_fixing_logical_off, cfg_.sinfo_, - IndexToOffset::NON_STRICT_CONTIGUOUS); + IndexToOffset::NON_STRICT_CONTIGUOUS); } } else { // index_select - src_off = IndexToOffset::get( + src_off = IndexToOffset::get( glb_indexing_logical_off, cfg_.sinfo_, - IndexToOffset::NON_STRICT_CONTIGUOUS); + IndexToOffset::NON_STRICT_CONTIGUOUS); dst_off = IndexToOffset::get( glb_fixing_logical_off, cfg_.dinfo_, @@ -822,7 +822,7 @@ struct IndexPutDeterministicKernelFunctor { IndexPutDeterministicKernelFunctor( int64_t* sorted_indices, int64_t* indices, - scalar_t* value, + const scalar_t* value, scalar_t* self, int64_t stride, int64_t stride_before, @@ -842,7 +842,7 @@ struct IndexPutDeterministicKernelFunctor { private: int64_t* sorted_indices_; int64_t* indices_; - scalar_t* value_; + const scalar_t* value_; scalar_t* self_; int64_t stride_; int64_t stride_before_; @@ -855,7 +855,7 @@ template void launch_index_put_deterministic_kernel( int64_t* sorted_indices, int64_t* indices, - scalar_t* value, + const scalar_t* value, scalar_t* self, int64_t numel, int64_t stride, diff --git a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp index 6a4efc440..171c54736 100644 --- a/src/ATen/native/xpu/sycl/LayerNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/LayerNormKernels.cpp @@ -22,12 +22,12 @@ class LayerNormForward : public NormForward { typedef NormForward NF; LayerNormForward() = delete; LayerNormForward( - scalar_t* X_data, + const scalar_t* X_data, scalar_t* Y_data, mean_t* mean_data, mean_t* var_data, - weight_t* gamma_data, - weight_t* beta_data, + const weight_t* gamma_data, + const weight_t* beta_data, accscalar_t eps, int64_t M, int64_t N) @@ -73,17 +73,17 @@ class LayerNormForward : public NormForward { j += cfg.workgroup_size * vec_size) { index_t 
plane_offset = group_id_foreach * cfg.workgroup_work_size + j; if (plane_offset < (index_t)cfg.problem_size) { - vec_t X_val = *( - reinterpret_cast(NF::X_data + group_offset + plane_offset)); + vec_t X_val = *(reinterpret_cast( + NF::X_data + group_offset + plane_offset)); weight_vec_t gamma_val, beta_val; vec_t Y_val; if (NF::gamma_data != nullptr) { - gamma_val = - *(reinterpret_cast(NF::gamma_data + plane_offset)); + gamma_val = *(reinterpret_cast( + NF::gamma_data + plane_offset)); } if (NF::beta_data != nullptr) { - beta_val = - *(reinterpret_cast(NF::beta_data + plane_offset)); + beta_val = *(reinterpret_cast( + NF::beta_data + plane_offset)); } for (int v = 0; v < vec_size; ++v) { @@ -120,12 +120,12 @@ class LayerNormBackward : public NormBackward { using accscalar_t = acc_type_device; LayerNormBackward() = delete; LayerNormBackward( - scalar_t* X_data, - scalar_t* dY_data, + const scalar_t* X_data, + const scalar_t* dY_data, scalar_t* dX_data, - mean_t* mean_data, - mean_t* var_data, - weight_t* gamma_data, + const mean_t* mean_data, + const mean_t* var_data, + const weight_t* gamma_data, int64_t M, int64_t N) : NormBackward( @@ -143,12 +143,12 @@ class LayerNormBackward : public NormBackward { } LayerNormBackward( - scalar_t* X_data, - scalar_t* dY_data, + const scalar_t* X_data, + const scalar_t* dY_data, scalar_t* dX_data, - mean_t* mean_data, - mean_t* var_data, - weight_t* gamma_data, + const mean_t* mean_data, + const mean_t* var_data, + const weight_t* gamma_data, accscalar_t* a_data, accscalar_t* b_data, int64_t M, @@ -190,13 +190,13 @@ class LayerNormBackward : public NormBackward { if (plane_offset < cfg.problem_size) { weight_vec_t gamma_val; if (NB::gamma_data != nullptr) { - gamma_val = - *(reinterpret_cast(NB::gamma_data + plane_offset)); + gamma_val = *(reinterpret_cast( + NB::gamma_data + plane_offset)); } - vec_t dY_val = *(reinterpret_cast( + vec_t dY_val = *(reinterpret_cast( NB::dY_data + group_offset + plane_offset)); - vec_t X_val = *( - reinterpret_cast(NB::X_data + group_offset + plane_offset)); + vec_t X_val = *(reinterpret_cast( + NB::X_data + group_offset + plane_offset)); for (int v = 0; v < vec_size; ++v) { accscalar_t value = (NB::gamma_data == nullptr) ? static_cast(dY_val[v]) @@ -239,14 +239,14 @@ class LayerNormBackward : public NormBackward { j += cfg.workgroup_size * vec_size) { index_t plane_offset = group_id_foreach * cfg.workgroup_work_size + j; if (plane_offset < (index_t)cfg.problem_size) { - vec_t dY_val = *(reinterpret_cast( + vec_t dY_val = *(reinterpret_cast( NB::dY_data + group_offset + plane_offset)); - vec_t X_val = *( - reinterpret_cast(NB::X_data + group_offset + plane_offset)); + vec_t X_val = *(reinterpret_cast( + NB::X_data + group_offset + plane_offset)); weight_vec_t gamma_val; if (NB::gamma_data != nullptr) { - gamma_val = - *(reinterpret_cast(NB::gamma_data + plane_offset)); + gamma_val = *(reinterpret_cast( + NB::gamma_data + plane_offset)); } vec_t dX_val; @@ -283,12 +283,14 @@ void _layer_norm_kernel( TORCH_CHECK(!gamma.defined() || gamma.numel() == N); TORCH_CHECK(!beta.defined() || beta.numel() == N); - scalar_t* X_data = X.data_ptr(); + const scalar_t* X_data = X.const_data_ptr(); scalar_t* Y_data = Y.data_ptr(); mean_t* mean_data = mean.data_ptr(); mean_t* var_data = rstd.data_ptr(); - weight_t* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - weight_t* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const weight_t* gamma_data = + gamma.defined() ? 
gamma.const_data_ptr() : nullptr; + const weight_t* beta_data = + beta.defined() ? beta.const_data_ptr() : nullptr; auto config = NormConfig(M, N, 1, sizeof(scalar_t)); bool can_use_32bit_index = canUse32BitIndexMath(X); @@ -338,8 +340,8 @@ struct GammaBetaBackwardSimpleKernelFunctor (group_id * cfg.workgroup_size + local_col_id) * vec_size; if (plane_offset < cfg.problem_size) { auto offset = row_id * cfg.problem_size + plane_offset; - vec_t X_val = *(reinterpret_cast(X_data + offset)); - vec_t dY_val = *(reinterpret_cast(dY_data + offset)); + vec_t X_val = *(reinterpret_cast(X_data + offset)); + vec_t dY_val = *(reinterpret_cast(dY_data + offset)); #pragma unroll(vec_size) for (int v = 0; v < vec_size; ++v) { dg_sum1[v] += (dg_data == nullptr) @@ -411,8 +413,8 @@ struct GammaBetaBackwardSimpleKernelFunctor const mean_t* mean_data_, const mean_t* var_data_, NormConfig cfg_, - scalar_t* dY_data_, - scalar_t* X_data_, + const scalar_t* dY_data_, + const scalar_t* X_data_, weight_t* dg_data_, weight_t* db_data_) : mean_data(mean_data_), @@ -429,8 +431,8 @@ struct GammaBetaBackwardSimpleKernelFunctor const mean_t* mean_data; const mean_t* var_data; NormConfig cfg; - scalar_t* dY_data; - scalar_t* X_data; + const scalar_t* dY_data; + const scalar_t* X_data; weight_t* dg_data; weight_t* db_data; sycl_local_acc_t local_sum1; @@ -451,8 +453,8 @@ void vec_gamma_beta_bwd_simple_kernel( Tensor& dgamma, Tensor& dbeta, NormConfig& cfg) { - scalar_t* dY_data = dY.data_ptr(); - scalar_t* X_data = X.data_ptr(); + const scalar_t* dY_data = dY.const_data_ptr(); + const scalar_t* X_data = X.const_data_ptr(); weight_t* dg_data = dgamma.defined() ? dgamma.data_ptr() : nullptr; weight_t* db_data = dbeta.defined() ? dbeta.data_ptr() : nullptr; @@ -536,14 +538,15 @@ void _layer_norm_backward_kernel( TORCH_CHECK(rstd.numel() == M); using accscalar_t = acc_type_device; - mean_t* mean_data = mean.data_ptr(); - mean_t* var_data = rstd.data_ptr(); - weight_t* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const mean_t* mean_data = mean.const_data_ptr(); + const mean_t* var_data = rstd.const_data_ptr(); + const weight_t* gamma_data = + gamma.defined() ? 
gamma.const_data_ptr() : nullptr; if (grad_input_mask[0]) { // backward data - scalar_t* X_data = X.data_ptr(); - scalar_t* dY_data = dY.data_ptr(); + const scalar_t* X_data = X.const_data_ptr(); + const scalar_t* dY_data = dY.const_data_ptr(); scalar_t* dX_data = dX.data_ptr(); auto config = NormConfig(M, N, 1, sizeof(scalar_t)); diff --git a/src/ATen/native/xpu/sycl/MaxUnpoolingKernels.cpp b/src/ATen/native/xpu/sycl/MaxUnpoolingKernels.cpp index 5853c53dd..580809570 100644 --- a/src/ATen/native/xpu/sycl/MaxUnpoolingKernels.cpp +++ b/src/ATen/native/xpu/sycl/MaxUnpoolingKernels.cpp @@ -183,8 +183,8 @@ struct MaxUnpooling3dForwardKernelFunctor { } } MaxUnpooling3dForwardKernelFunctor( - scalar_t* input_data, - int64_t* indices_data, + const scalar_t* input_data, + const int64_t* indices_data, scalar_t* output_data, const int64_t batchSize, const int64_t inputSlices, @@ -209,8 +209,8 @@ struct MaxUnpooling3dForwardKernelFunctor { offsetZ_(offsetZ) {} private: - scalar_t* input_data_; - int64_t* indices_data_; + const scalar_t* input_data_; + const int64_t* indices_data_; scalar_t* output_data_; const int64_t batchSize_; const int64_t inputSlices_; @@ -225,8 +225,8 @@ struct MaxUnpooling3dForwardKernelFunctor { template void max_unpooling3d_forward_kernel( - scalar_t* input, - int64_t* indices, + const scalar_t* input, + const int64_t* indices, scalar_t* output, const int64_t batchSize, const int64_t inputSlices, @@ -443,9 +443,9 @@ Tensor& max_unpooling3d_forward_kernel( "max_unpooling3d_forward_xpu", ([&] { max_unpooling3d_forward_kernel( - self.data_ptr(), - indices.data_ptr(), - output.data_ptr(), + self.const_data_ptr(), + indices.const_data_ptr(), + output.mutable_data_ptr(), batchSize, inputSlices, inputTime, diff --git a/src/ATen/native/xpu/sycl/MemoryAccess.h b/src/ATen/native/xpu/sycl/MemoryAccess.h index 3670b1c81..4843e170b 100644 --- a/src/ATen/native/xpu/sycl/MemoryAccess.h +++ b/src/ATen/native/xpu/sycl/MemoryAccess.h @@ -278,7 +278,7 @@ static inline int preferred_vector_width(at::DeviceIndex dev_id, int elem_sz) { // get enough payloads without outer loop. Outer loop may bring additional // instructions and potential registers usage. template -inline int can_vectorize_up_to(char* pointer) { +inline int can_vectorize_up_to(const char* pointer) { int elem_size = sizeof(scalar_t); at::DeviceIndex dev_id = c10::xpu::current_device(); int preferred_width = preferred_vector_width(dev_id, elem_size); @@ -303,6 +303,11 @@ inline int can_vectorize_up_to(char* pointer) { return 1; } +template +inline int can_vectorize_up_to(char* pointer) { + return can_vectorize_up_to(static_cast(pointer)); +} + template struct can_vectorize_up_to_helper { template diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index 0cb77d52f..5b1a86973 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -51,10 +51,10 @@ struct CopyIfFunc { bool operator()(int64_t x) const { return self_begin_[x] != scalar_t(0); } - CopyIfFunc(scalar_t* self_begin) : self_begin_(self_begin) {} + CopyIfFunc(const scalar_t* self_begin) : self_begin_(self_begin) {} private: - scalar_t* self_begin_; + const scalar_t* self_begin_; }; template <> @@ -68,10 +68,10 @@ struct CopyIfFunc { bool res = in != int(0) ? 
1 : 0; return res; } - CopyIfFunc(bool* self_begin) : self_begin_(self_begin) {} + CopyIfFunc(const bool* self_begin) : self_begin_(self_begin) {} private: - bool* self_begin_; + const bool* self_begin_; }; template @@ -89,7 +89,7 @@ void nonzero_template(const Tensor& self_, Tensor& tensor) { Tensor range = at::empty( {N}, tensor.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT)); - scalar_t* self_begin = self.data_ptr(); + const scalar_t* self_begin = self.const_data_ptr(); int64_t* idx_flat_begin = idx_flat.data_ptr(); int64_t* range_begin = nullptr; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 48d605a4e..6dd893100 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -366,12 +366,12 @@ class NormForward { using accscalar_t = acc_type_device; NormForward() = delete; NormForward( - scalar_t* X_data, + const scalar_t* X_data, scalar_t* Y_data, mean_t* mean_data, mean_t* var_data, - weight_t* gamma_data, - weight_t* beta_data, + const weight_t* gamma_data, + const weight_t* beta_data, accscalar_t eps) : X_data(X_data), Y_data(Y_data), @@ -384,7 +384,7 @@ class NormForward { int get_rowwise_reduce_vec_size(int problem_size, int vec_size) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(X_data))); + can_vectorize_up_to(reinterpret_cast(X_data))); while (problem_size % vec_size != 0) { vec_size = vec_size >> 1; @@ -395,19 +395,21 @@ class NormForward { int get_update_vec_size(int problem_size, int vec_size) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(X_data))); + can_vectorize_up_to(reinterpret_cast(X_data))); vec_size = std::min( vec_size, can_vectorize_up_to(reinterpret_cast(Y_data))); if (gamma_data) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(gamma_data))); + can_vectorize_up_to( + reinterpret_cast(gamma_data))); } if (beta_data) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(gamma_data))); + can_vectorize_up_to( + reinterpret_cast(gamma_data))); } while (problem_size % vec_size != 0) { @@ -419,7 +421,7 @@ class NormForward { int get_eltwise_update_vec_size(int vec_size) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(X_data))); + can_vectorize_up_to(reinterpret_cast(X_data))); vec_size = std::min( vec_size, can_vectorize_up_to(reinterpret_cast(Y_data))); @@ -446,8 +448,8 @@ class NormForward { j += cfg.workgroup_size * vec_size) { index_t plane_offset = group_id_foreach * cfg.workgroup_work_size + j; if (plane_offset < (index_t)cfg.problem_size) { - vec_t value = - *(reinterpret_cast(X_data + group_offset + plane_offset)); + vec_t value = *(reinterpret_cast( + X_data + group_offset + plane_offset)); for (int v = 0; v < vec_size; ++v) { sum1 += static_cast(value[v]); sum2 += static_cast(value[v]) * @@ -473,12 +475,12 @@ class NormForward { } public: - scalar_t* X_data; + const scalar_t* X_data; scalar_t* Y_data; mean_t* mean_data; mean_t* var_data; - weight_t* gamma_data; - weight_t* beta_data; + const weight_t* gamma_data; + const weight_t* beta_data; accscalar_t eps; }; @@ -491,12 +493,12 @@ class NormBackward { public: using accscalar_t = acc_type_device; NormBackward( - scalar_t* X_data, - scalar_t* dY_data, + const scalar_t* X_data, + const scalar_t* dY_data, scalar_t* dX_data, - mean_t* mean_data, - mean_t* var_data, - weight_t* gamma_data, + const mean_t* mean_data, + const mean_t* var_data, + const weight_t* gamma_data, accscalar_t* a_data, accscalar_t* b_data) : 
X_data(X_data), @@ -508,26 +510,27 @@ class NormBackward { a_data(a_data), b_data(b_data) {} - scalar_t* X_data; - scalar_t* dY_data; + const scalar_t* X_data; + const scalar_t* dY_data; scalar_t* dX_data; - mean_t* mean_data; - mean_t* var_data; - weight_t* gamma_data; + const mean_t* mean_data; + const mean_t* var_data; + const weight_t* gamma_data; accscalar_t* a_data; accscalar_t* b_data; int get_rowwise_reduce_vec_size(int problem_size, int vec_size) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(X_data))); + can_vectorize_up_to(reinterpret_cast(X_data))); vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(dY_data))); + can_vectorize_up_to(reinterpret_cast(dY_data))); if (gamma_data) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(gamma_data))); + can_vectorize_up_to( + reinterpret_cast(gamma_data))); } while (problem_size % vec_size != 0) { @@ -539,17 +542,18 @@ class NormBackward { int get_update_vec_size(int problem_size, int vec_size) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(X_data))); + can_vectorize_up_to(reinterpret_cast(X_data))); vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(dY_data))); + can_vectorize_up_to(reinterpret_cast(dY_data))); vec_size = std::min( vec_size, can_vectorize_up_to(reinterpret_cast(dX_data))); if (gamma_data) { vec_size = std::min( vec_size, - can_vectorize_up_to(reinterpret_cast(gamma_data))); + can_vectorize_up_to( + reinterpret_cast(gamma_data))); } while (problem_size % vec_size != 0) { diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp index ef838a969..0a88bbb3a 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp @@ -92,7 +92,7 @@ struct ReflectionPad1dKernelFunctor { } } ReflectionPad1dKernelFunctor( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* output_data, int64_t input_w, int64_t pad_l, @@ -104,7 +104,7 @@ struct ReflectionPad1dKernelFunctor { output_w_(output_w) {} private: - scalar_t* input_data_; + const scalar_t* input_data_; scalar_t* output_data_; int64_t input_w_; int64_t pad_l_; @@ -113,7 +113,7 @@ struct ReflectionPad1dKernelFunctor { template void reflection_pad1d_template( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t input_w, int64_t pad_l, @@ -150,7 +150,7 @@ struct ReflectionPad1dBackwardKernelFunctor { } ReflectionPad1dBackwardKernelFunctor( scalar_t* grad_input_data, - scalar_t* grad_output_data, + const scalar_t* grad_output_data, int64_t input_w, int64_t pad_l, int64_t output_w) @@ -162,7 +162,7 @@ struct ReflectionPad1dBackwardKernelFunctor { private: scalar_t* grad_input_data_; - scalar_t* grad_output_data_; + const scalar_t* grad_output_data_; int64_t input_w_; int64_t pad_l_; int64_t output_w_; @@ -171,7 +171,7 @@ struct ReflectionPad1dBackwardKernelFunctor { template void reflection_pad1d_backward_template( scalar_t* grad_input, - scalar_t* grad_output, + const scalar_t* grad_output, int64_t input_w, int64_t pad_l, int64_t pad_r, @@ -211,7 +211,7 @@ struct ReflectionPad2dKernellFunctor { } } ReflectionPad2dKernellFunctor( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t input_dim_x, int64_t input_dim_y, @@ -229,7 +229,7 @@ struct ReflectionPad2dKernellFunctor { output_dim_y_(output_dim_y) {} private: - scalar_t* input_; + const scalar_t* input_; scalar_t* output_; int64_t input_dim_x_; int64_t input_dim_y_; @@ 
-241,7 +241,7 @@ struct ReflectionPad2dKernellFunctor { template void reflection_pad2d_template( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t input_dim_x, int64_t input_dim_y, @@ -298,7 +298,7 @@ struct ReflectionPad2dBackwardKernelFunctor { } ReflectionPad2dBackwardKernelFunctor( scalar_t* grad_input, - scalar_t* grad_output, + const scalar_t* grad_output, int64_t input_dim_x, int64_t input_dim_y, int64_t pad_t, @@ -316,7 +316,7 @@ struct ReflectionPad2dBackwardKernelFunctor { private: scalar_t* grad_input_; - scalar_t* grad_output_; + const scalar_t* grad_output_; int64_t input_dim_x_; int64_t input_dim_y_; int64_t pad_t_; @@ -328,7 +328,7 @@ struct ReflectionPad2dBackwardKernelFunctor { template void reflection_pad2d_backward_template( scalar_t* grad_input, - scalar_t* grad_output, + const scalar_t* grad_output, int64_t input_dim_x, int64_t input_dim_y, int64_t pad_t, @@ -360,7 +360,7 @@ void reflection_pad2d_backward_template( kfn); } -template +template struct ParallelReflectionPad3dKernelFunctor { void operator()(sycl::nd_item<3> item) const { auto output_id = item.get_global_id(2); @@ -402,8 +402,8 @@ struct ParallelReflectionPad3dKernelFunctor { input_x); } ParallelReflectionPad3dKernelFunctor( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, @@ -418,8 +418,8 @@ struct ParallelReflectionPad3dKernelFunctor { output_plane_size_(output_plane_size) {} private: - PackedTensorAccessor64 input_; - PackedTensorAccessor64 output_; + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; int64_t pad_left_; int64_t pad_top_; int64_t pad_front_; @@ -427,10 +427,10 @@ struct ParallelReflectionPad3dKernelFunctor { int64_t output_plane_size_; }; -template +template inline void parallel_reflection_pad3d( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, @@ -442,7 +442,7 @@ inline void parallel_reflection_pad3d( int64_t nplane = input.size(1); int64_t nbatch = input.size(0); - ParallelReflectionPad3dKernelFunctor kfn( + ParallelReflectionPad3dKernelFunctor kfn( input, output, pad_left, pad_top, pad_front, f, output_plane_size); sycl_kernel_submit( sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), @@ -454,7 +454,7 @@ inline void parallel_reflection_pad3d( template struct reflection_pad3d_kernel_functor { void operator()( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t plane, int64_t batch, @@ -471,7 +471,7 @@ struct reflection_pad3d_kernel_functor { template void reflection_pad3d_template( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, @@ -484,7 +484,7 @@ template struct reflection_pad3d_backward_kernel_functor { void operator()( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t plane, int64_t batch, int64_t output_z, @@ -503,7 +503,7 @@ struct reflection_pad3d_backward_kernel_functor { template void reflection_pad3d_backward_template( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t pad_left, int64_t pad_top, int64_t pad_front) { @@ -546,8 +546,8 @@ void reflection_pad1d_kernel( 
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( kHalf, kBFloat16, input.scalar_type(), "reflection_pad1d_xpu", [&] { reflection_pad1d_template( - input.data_ptr(), - output.data_ptr(), + input.const_data_ptr(), + output.mutable_data_ptr(), input_w, pad_l, pad_r, @@ -603,8 +603,8 @@ void reflection_pad1d_backward_kernel( "reflection_pad1d_backward_xpu", [&] { reflection_pad1d_backward_template( - grad_input.data_ptr(), - grad_output.data_ptr(), + grad_input.mutable_data_ptr(), + grad_output.const_data_ptr(), input_w, pad_l, pad_r, @@ -698,8 +698,8 @@ void reflection_pad2d_kernel( AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( kHalf, kBFloat16, input.scalar_type(), "reflection_pad2d_xpu", [&] { reflection_pad2d_template( - input.data_ptr(), - output.data_ptr(), + input.const_data_ptr(), + output.mutable_data_ptr(), input_w, input_h, pad_t, @@ -775,8 +775,8 @@ void reflection_pad2d_backward_kernel( "reflection_pad2d_backward_xpu", [&] { reflection_pad2d_backward_template( - grad_input.data_ptr(), - grad_output.data_ptr(), + grad_input.mutable_data_ptr(), + grad_output.const_data_ptr(), input_w, input_h, pad_t, @@ -816,7 +816,7 @@ void reflection_pad3d_kernel( output_inner = output.unsqueeze(0); } - auto input_packed = input_inner.packed_accessor64(); + auto input_packed = input_inner.packed_accessor64(); auto output_packed = output_inner.packed_accessor64(); reflection_pad3d_template( @@ -861,7 +861,8 @@ void reflection_pad3d_backward_kernel( } auto grad_input_packed = grad_input_.packed_accessor64(); - auto grad_output_packed = grad_output_.packed_accessor64(); + auto grad_output_packed = + grad_output_.packed_accessor64(); reflection_pad3d_backward_template( grad_input_packed, diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp index 75b9322b3..d88ad407e 100644 --- a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp @@ -24,7 +24,7 @@ inline int imax(int a, int b) { return a > b ? 
a : b; } -template +template struct ParallelReplicationPad1dKernelFunctor { void operator()(sycl::nd_item<3> item) const { auto output_id = item.get_global_id(2); @@ -45,8 +45,8 @@ struct ParallelReplicationPad1dKernelFunctor { } } ParallelReplicationPad1dKernelFunctor( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_right, const F f, @@ -59,25 +59,25 @@ struct ParallelReplicationPad1dKernelFunctor { output_plane_size_(output_plane_size) {} private: - PackedTensorAccessor64 input_; - PackedTensorAccessor64 output_; + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; int64_t pad_left_; int64_t pad_right_; const F f_; int64_t output_plane_size_; }; -template +template void parallel_replication_pad1d( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_right, const F& f) { auto queue = getCurrentSYCLQueue(); int64_t output_plane_size = output.size(2); - ParallelReplicationPad1dKernelFunctor kfn( + ParallelReplicationPad1dKernelFunctor kfn( input, output, pad_left, pad_right, f, output_plane_size); int64_t work_group_size = syclMaxWorkGroupSize(kfn); @@ -95,7 +95,7 @@ void parallel_replication_pad1d( template struct ReplicationPad1dForwardFunctor { void operator()( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t plane, int64_t batch, @@ -108,7 +108,7 @@ struct ReplicationPad1dForwardFunctor { template void replication_pad1d_forward_template( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_right) { @@ -120,7 +120,7 @@ template struct ReplicationPad1dBackwardFunctor { void operator()( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t plane, int64_t batch, int64_t output_x, @@ -135,14 +135,14 @@ struct ReplicationPad1dBackwardFunctor { template void replication_pad1d_backward_template( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t pad_left, int64_t pad_right) { ReplicationPad1dBackwardFunctor f; parallel_replication_pad1d(grad_input, grad_output, pad_left, pad_right, f); } -template +template struct ParallelReplicationPad2dKernelFunctor { void operator()(sycl::nd_item<3> item) const { const int output_id = item.get_global_id(2); @@ -169,32 +169,32 @@ struct ParallelReplicationPad2dKernelFunctor { } } ParallelReplicationPad2dKernelFunctor( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t padT, int64_t padL, const F f) : input_(input), output_(output), padT_(padT), padL_(padL), f_(f) {} private: - PackedTensorAccessor64 input_; - PackedTensorAccessor64 output_; + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; int64_t padT_; int64_t padL_; const F f_; }; -template +template void parallel_replication_pad2d( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, const int padT, const int padL, const F& f) { auto queue = getCurrentSYCLQueue(); int64_t output_plane_size = output.size(2) * output.size(3); - ParallelReplicationPad2dKernelFunctor kfn( + ParallelReplicationPad2dKernelFunctor kfn( input, output, padT, padL, f); 
int64_t work_group_size = syclMaxWorkGroupSize(kfn); @@ -212,7 +212,7 @@ void parallel_replication_pad2d( template struct ReplicationPad2dForwardFunctor { void operator()( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t batch, int64_t plane, @@ -227,7 +227,7 @@ struct ReplicationPad2dForwardFunctor { template void replication_pad2d_forward_template( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t padT, int64_t padL) { @@ -239,7 +239,7 @@ template struct ReplicationPad2dBackwardFunctor { void operator()( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t batch, int64_t plane, int64_t input_x, @@ -256,14 +256,14 @@ struct ReplicationPad2dBackwardFunctor { template void replication_pad2d_backward_template( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, const int padT, const int padL) { ReplicationPad2dBackwardFunctor f; parallel_replication_pad2d(grad_input, grad_output, padT, padL, f); } -template +template struct ParallelReplicationPad3dKernelFunctor { void operator()(sycl::nd_item<3> item) const { auto output_id = item.get_global_id(2); @@ -302,8 +302,8 @@ struct ParallelReplicationPad3dKernelFunctor { } } ParallelReplicationPad3dKernelFunctor( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, @@ -318,8 +318,8 @@ struct ParallelReplicationPad3dKernelFunctor { output_plane_size_(output_plane_size) {} private: - PackedTensorAccessor64 input_; - PackedTensorAccessor64 output_; + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; int64_t pad_left_; int64_t pad_top_; int64_t pad_front_; @@ -327,10 +327,10 @@ struct ParallelReplicationPad3dKernelFunctor { int64_t output_plane_size_; }; -template +template void parallel_replication_pad3d( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, @@ -338,7 +338,7 @@ void parallel_replication_pad3d( auto queue = getCurrentSYCLQueue(); int64_t output_plane_size = output.size(2) * output.size(3) * output.size(4); - ParallelReplicationPad3dKernelFunctor kfn( + ParallelReplicationPad3dKernelFunctor kfn( input, output, pad_left, pad_top, pad_front, f, output_plane_size); int64_t work_group_size = syclMaxWorkGroupSize(kfn); int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); @@ -355,7 +355,7 @@ void parallel_replication_pad3d( template struct ReplicationPad3dForwardFunctor { void operator()( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t plane, int64_t batch, @@ -372,7 +372,7 @@ struct ReplicationPad3dForwardFunctor { template void replication_pad3d_forward_template( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, @@ -385,7 +385,7 @@ template struct ReplicationPad3dBackwardFunctor { void operator()( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t plane, int64_t batch, int64_t output_z, @@ -404,7 +404,7 @@ struct ReplicationPad3dBackwardFunctor { template void replication_pad3d_backward_template( PackedTensorAccessor64 
grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t pad_left, int64_t pad_top, int64_t pad_front) { @@ -438,7 +438,7 @@ void replication_pad1d_kernel( output_ = output.unsqueeze(0); } - auto input_packed = input_.packed_accessor64(); + auto input_packed = input_.packed_accessor64(); auto output_packed = output_.packed_accessor64(); replication_pad1d_forward_template( @@ -484,7 +484,8 @@ void replication_pad1d_backward_kernel( grad_output_ = grad_output.unsqueeze(0); } auto grad_input_packed = grad_input_.packed_accessor64(); - auto grad_output_packed = grad_output_.packed_accessor64(); + auto grad_output_packed = + grad_output_.packed_accessor64(); replication_pad1d_backward_template( grad_input_packed, grad_output_packed, pad_left, pad_right); @@ -511,7 +512,7 @@ void replication_pad2d_kernel( input_ = input.unsqueeze(0); output_ = output.unsqueeze(0); } - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); replication_pad2d_forward_template( devInput, devOutput, padT, padL); @@ -583,7 +584,8 @@ void replication_pad2d_backward_kernel( grad_output_ = grad_output.unsqueeze(0); } auto grad_input_packed = grad_input_.packed_accessor64(); - auto grad_output_packed = grad_output_.packed_accessor64(); + auto grad_output_packed = + grad_output_.packed_accessor64(); replication_pad2d_backward_template( grad_input_packed, grad_output_packed, padT, padL); @@ -612,7 +614,7 @@ void replication_pad3d_kernel( output_ = output.unsqueeze(0); } - auto input_packed = input_.packed_accessor64(); + auto input_packed = input_.packed_accessor64(); auto output_packed = output_.packed_accessor64(); replication_pad3d_forward_template( @@ -752,7 +754,8 @@ void replication_pad3d_backward_kernel( grad_output_ = grad_output.unsqueeze(0); } auto grad_input_packed = grad_input_.packed_accessor64(); - auto grad_output_packed = grad_output_.packed_accessor64(); + auto grad_output_packed = + grad_output_.packed_accessor64(); replication_pad3d_backward_template( grad_input_packed, grad_output_packed, diff --git a/src/ATen/native/xpu/sycl/ScanUtils.h b/src/ATen/native/xpu/sycl/ScanUtils.h index afb8ed496..45d72058f 100644 --- a/src/ATen/native/xpu/sycl/ScanUtils.h +++ b/src/ATen/native/xpu/sycl/ScanUtils.h @@ -1394,8 +1394,8 @@ void scan( TORCH_INTERNAL_ASSERT(self.is_contiguous()); - TensorInfo input_info = - getTensorInfo(input_); + TensorInfo input_info = + getTensorInfo(input_); int dim_after_collapse = input_info.collapseDims(dimension); TensorInfo output_info = @@ -1443,8 +1443,8 @@ void scan_with_indices( dimension, " out of range"); - TensorInfo input_info = - getTensorInfo(self); + TensorInfo input_info = + getTensorInfo(self); int dim_after_collapse = input_info.collapseDims(dimension); TensorInfo output_info = diff --git a/src/ATen/native/xpu/sycl/SegmentReduceKernels.cpp b/src/ATen/native/xpu/sycl/SegmentReduceKernels.cpp index a5f514777..e7b25d9d4 100644 --- a/src/ATen/native/xpu/sycl/SegmentReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/SegmentReduceKernels.cpp @@ -18,8 +18,8 @@ #include #endif -#include #include +#include namespace at::native::xpu { @@ -83,7 +83,7 @@ struct SegmentReduceForwardKernelFunctor { SegmentReduceForwardKernelFunctor( native::ReductionType reduction, scalar_t* output_data, - scalar_t* values_data, + const scalar_t* values_data, const index_t* lengths_data, const index_t* lengths_cumsum_data, const int64_t segment_count, @@ -119,7 +119,7 @@ struct 
SegmentReduceForwardKernelFunctor { private: native::ReductionType reduction_; scalar_t* output_data_; - scalar_t* values_data_; + const scalar_t* values_data_; const index_t* lengths_data_; const index_t* lengths_cumsum_data_; const int64_t segment_count_; @@ -140,7 +140,7 @@ template void segment_reduce_forward_kernel( native::ReductionType reduction, scalar_t* output_data, - scalar_t* values_data, + const scalar_t* values_data, const index_t* lengths_data, const index_t* lengths_cumsum_data, const int64_t segment_count, @@ -241,16 +241,16 @@ Tensor _segment_reduce_lengths_offsets_xpu_kernel( AT_DISPATCH_INDEX_TYPES( lengths_or_offsets.scalar_type(), "_segment_reduce_xpu_kernel1", ([&] { - auto* offsets_data_ptr = offsets.data_ptr(); - auto* lengths_data_ptr = lengths.data_ptr(); + auto* offsets_data_ptr = offsets.const_data_ptr(); + auto* lengths_data_ptr = lengths.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, data.scalar_type(), "segment_reduce_xpu", [&]() { - auto* data_data_ptr = data.data_ptr(); - auto* output_data_ptr = output.data_ptr(); + auto* data_data_ptr = data.const_data_ptr(); + auto* output_data_ptr = output.mutable_data_ptr(); // initialize starting value scalar_t initial_value; @@ -409,8 +409,8 @@ struct SegmentReduceBackwardKernelFunctor { SegmentReduceBackwardKernelFunctor( native::ReductionType reduction, scalar_t* grad_input_data, - scalar_t* grad_data, - scalar_t* output_data, + const scalar_t* grad_data, + const scalar_t* output_data, const scalar_t* values_data, const index_t* lengths_data, const index_t* lengths_cumsum_data, @@ -447,8 +447,8 @@ struct SegmentReduceBackwardKernelFunctor { private: native::ReductionType reduction_; scalar_t* grad_input_data_; - scalar_t* grad_data_; - scalar_t* output_data_; + const scalar_t* grad_data_; + const scalar_t* output_data_; const scalar_t* values_data_; const index_t* lengths_data_; const index_t* lengths_cumsum_data_; @@ -469,8 +469,8 @@ template void segment_reduce_backward_kernel( native::ReductionType reduction, scalar_t* grad_input_data, - scalar_t* grad_data, - scalar_t* output_data, + const scalar_t* grad_data, + const scalar_t* output_data, const scalar_t* values_data, const index_t* lengths_data, const index_t* lengths_cumsum_data, @@ -572,8 +572,8 @@ Tensor _segment_reduce_lengths_offsets_backward_xpu_kernel( lengths_or_offsets_contig.scalar_type(), "_segment_reduce_xpu_lengths_offsets_backward_kernel1", ([&] { - const auto* lengths_data = lengths.data_ptr(); - auto* offsets_data = offsets.data_ptr(); + const auto* lengths_data = lengths.const_data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); // TODO: Switch to TensorIterator for better maintainablility and // readability @@ -583,10 +583,10 @@ Tensor _segment_reduce_lengths_offsets_backward_xpu_kernel( data_contig.scalar_type(), "_segment_reduce_xpu", ([&]() { - auto* output_data = output_contig.data_ptr(); - auto* grad_data = grad_contig.data_ptr(); - auto* grad_input_data = grad_input.data_ptr(); - const auto* values_data = data_contig.data_ptr(); + auto* output_data = output_contig.const_data_ptr(); + auto* grad_data = grad_contig.const_data_ptr(); + auto* grad_input_data = grad_input.mutable_data_ptr(); + const auto* values_data = data_contig.const_data_ptr(); scalar_t initial_prod_value; if (initial.has_value()) { diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index eb1f0e090..722192d44 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ 
b/src/ATen/native/xpu/sycl/Shape.cpp @@ -228,8 +228,8 @@ void parallel_cat( int64_t dimSize = at::native::size(inputs[i + batchCounter].get(), dimension); - stackInputs[batchCounter].input = static_cast( - inputs[i + batchCounter].get().data_ptr()); + stackInputs[batchCounter].input = + (scalar_in_t*)(inputs[i + batchCounter].get().const_data_ptr()); stackInputs[batchCounter].offset = offset; stackInputs[batchCounter].dimSize = dimSize; stackInputs[batchCounter].nElements = diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 81be06363..28d812f2c 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -240,7 +240,7 @@ struct DispatchSoftmaxForwardKernelFunctor if (index >= dim_size_) break; - reg_in[i] = *(reinterpret_cast(in_data_ + group_offset + index)); + reg_in[i] = *(reinterpret_cast(in_data_ + group_offset + index)); if constexpr (is_masked) { auto vec_offset = group_offset + index; #pragma unroll(vec_size) @@ -328,11 +328,11 @@ struct DispatchSoftmaxForwardKernelFunctor } DispatchSoftmaxForwardKernelFunctor( - scalar_t* in_data, + const scalar_t* in_data, scalar_t* out_data, int dim_size, int outer_size, - bool* mask_data, + const bool* mask_data, calc_t input_calc, int sub_group_num, int global_size_row, @@ -356,11 +356,11 @@ struct DispatchSoftmaxForwardKernelFunctor nan_(nan) {} private: - scalar_t* in_data_; + const scalar_t* in_data_; scalar_t* out_data_; int dim_size_; int outer_size_; - bool* mask_data_; + const bool* mask_data_; calc_t input_calc_; int sub_group_num_; int global_size_row_; @@ -388,11 +388,11 @@ template < bool is_masked = false, typename calc_t = decltype(nullptr)> bool dispatch_softmax_forward_kernel( - scalar_t* in_data, + const scalar_t* in_data, scalar_t* out_data, int dim_size, int outer_size, - bool* mask_data = nullptr, + const bool* mask_data = nullptr, calc_t input_calc = nullptr) { using vec_t = at::native::memory::aligned_vector; auto& queue = getCurrentSYCLQueue(); @@ -518,7 +518,7 @@ struct SoftmaxForwardKernelFunctor { // get max value auto max_value = std::numeric_limits::lowest(); for (int i = local_id; i < loops_end; i += local_size_) { - vec_t in_val = *(reinterpret_cast( + vec_t in_val = *(reinterpret_cast( in_data_ + group_offset - start + i * vec_size)); #pragma unroll(vec_size) for (IndexType j = 0; j < vec_size; ++j) { @@ -535,7 +535,7 @@ struct SoftmaxForwardKernelFunctor { // get sum value auto sum_value = accscalar_t(0); for (IndexType i = local_id; i < loops_end; i += local_size_) { - vec_t in_val = *(reinterpret_cast( + vec_t in_val = *(reinterpret_cast( in_data_ + group_offset - start + i * vec_size)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { @@ -569,7 +569,7 @@ struct SoftmaxForwardKernelFunctor { } } } else { - vec_t in_val = *(reinterpret_cast( + vec_t in_val = *(reinterpret_cast( in_data_ + group_offset - start + i * vec_size)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { @@ -586,7 +586,7 @@ struct SoftmaxForwardKernelFunctor { } } SoftmaxForwardKernelFunctor( - scalar_t* in_data, + const scalar_t* in_data, scalar_t* out_data, int dim_size, int outer_size, @@ -598,7 +598,7 @@ struct SoftmaxForwardKernelFunctor { local_size_(local_size) {} private: - scalar_t* in_data_; + const scalar_t* in_data_; scalar_t* out_data_; int dim_size_; int outer_size_; @@ -612,7 +612,7 @@ template < typename IndexType, bool LogSoftMax> void softmax_forward_kernel( - scalar_t* in_data, + const 
scalar_t* in_data, scalar_t* out_data, int dim_size, int outer_size) { @@ -658,14 +658,14 @@ struct SpatialSoftmaxForwardKernelFunctor // get max value accscalar_t max_value[vec_size]; auto offset = local_row_id * inner_size_ + global_col * vec_size; - vec_t value = *(reinterpret_cast(in_data_ + group_offset + offset)); + vec_t value = *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { max_value[j] = accscalar_t(value[j]); } for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) { offset = i * inner_size_ + global_col * vec_size; - value = *(reinterpret_cast(in_data_ + group_offset + offset)); + value = *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { max_value[j] = std::max(max_value[j], accscalar_t(value[j])); @@ -688,14 +688,14 @@ struct SpatialSoftmaxForwardKernelFunctor // get sum value accscalar_t sum_value[vec_size]; offset = local_row_id * inner_size_ + global_col * vec_size; - value = *(reinterpret_cast(in_data_ + group_offset + offset)); + value = *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { sum_value[j] = std::exp(value[j] - max_value[j]); } for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) { offset = i * inner_size_ + global_col * vec_size; - value = *(reinterpret_cast(in_data_ + group_offset + offset)); + value = *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { sum_value[j] += std::exp(value[j] - max_value[j]); @@ -730,7 +730,7 @@ struct SpatialSoftmaxForwardKernelFunctor for (int i = local_row_id; i < dim_size_; i += block_row_) { auto offset = i * inner_size_ + global_col * vec_size; vec_t in_val = - *(reinterpret_cast(in_data_ + group_offset + offset)); + *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { if (LogSoftMax) @@ -753,7 +753,7 @@ struct SpatialSoftmaxForwardKernelFunctor } SpatialSoftmaxForwardKernelFunctor( - scalar_t* in_data, + const scalar_t* in_data, scalar_t* out_data, int dim_size, int inner_size, @@ -771,7 +771,7 @@ struct SpatialSoftmaxForwardKernelFunctor group_num_(group_num) {} private: - scalar_t* in_data_; + const scalar_t* in_data_; scalar_t* out_data_; int dim_size_; int inner_size_; @@ -789,7 +789,7 @@ template < typename IndexType, bool LogSoftMax> void spatial_softmax_forward( - scalar_t* in_data, + const scalar_t* in_data, scalar_t* out_data, int dim_size, int inner_size, @@ -866,9 +866,10 @@ struct DispatchSoftmaxBackwardKernelFunctor if (index >= dim_size_) break; - reg_out[i] = *(reinterpret_cast(output_ + group_offset + index)); + reg_out[i] = + *(reinterpret_cast(output_ + group_offset + index)); reg_gradout[i] = - *(reinterpret_cast(gradOutput_ + group_offset + index)); + *(reinterpret_cast(gradOutput_ + group_offset + index)); if constexpr (is_masked) { auto vec_offset = group_offset + index; #pragma unroll(vec_size) @@ -929,11 +930,11 @@ struct DispatchSoftmaxBackwardKernelFunctor DispatchSoftmaxBackwardKernelFunctor( scalar_t* gradInput, - scalar_t* output, - scalar_t* gradOutput, + const scalar_t* output, + const scalar_t* gradOutput, int dim_size, int outer_size, - bool* mask_data, + const bool* mask_data, calc_t input_calc, int sub_group_num, int global_size_row, @@ -955,11 +956,11 @@ struct DispatchSoftmaxBackwardKernelFunctor private: scalar_t* gradInput_; - 
scalar_t* output_; - scalar_t* gradOutput_; + const scalar_t* output_; + const scalar_t* gradOutput_; int dim_size_; int outer_size_; - bool* mask_data_; + const bool* mask_data_; calc_t input_calc_; int sub_group_num_; int global_size_row_; @@ -981,11 +982,11 @@ template < typename calc_t = decltype(nullptr)> bool dispatch_softmax_backward_kernel( scalar_t* gradInput, - scalar_t* output, - scalar_t* gradOutput, + const scalar_t* output, + const scalar_t* gradOutput, int dim_size, int outer_size, - bool* mask_data = nullptr, + const bool* mask_data = nullptr, calc_t input_calc = nullptr) { using vec_t = at::native::memory::aligned_vector; auto& queue = getCurrentSYCLQueue(); @@ -1432,8 +1433,8 @@ void spatial_softmax_forward( uint32_t, \ LogSoftMax, \ outer_loop>( \ - input.data_ptr(), \ - output.data_ptr(), \ + input.const_data_ptr(), \ + output.mutable_data_ptr(), \ dim_size, \ outer_size); \ } @@ -1446,8 +1447,8 @@ void spatial_softmax_forward( accscalar_t, \ IndexType, \ LogSoftMax>( \ - input.data_ptr(), \ - output.data_ptr(), \ + input.const_data_ptr(), \ + output.mutable_data_ptr(), \ dim_size, \ outer_size); \ } @@ -1460,8 +1461,8 @@ void spatial_softmax_forward( accscalar_t, \ IndexType, \ LogSoftMax>( \ - input.data_ptr(), \ - output.data_ptr(), \ + input.const_data_ptr(), \ + output.mutable_data_ptr(), \ dim_size, \ inner_size, \ outer_size); \ @@ -1605,18 +1606,18 @@ void spatial_softmax_backward( accscalar_t, \ uint32_t, \ LogSoftMax>( \ - gradInput.data_ptr(), \ - output.data_ptr(), \ - gradOutput.data_ptr(), \ + gradInput.mutable_data_ptr(), \ + output.const_data_ptr(), \ + gradOutput.const_data_ptr(), \ dim_size, \ outer_size); \ } #define SOFTMAX_BACKWARD_IMPL(vec_size, IndexType) \ softmax_backward_kernel( \ - gradInput.data_ptr(), \ - output.data_ptr(), \ - gradOutput.data_ptr(), \ + gradInput.mutable_data_ptr(), \ + output.const_data_ptr(), \ + gradOutput.const_data_ptr(), \ dim_size, \ outer_size); @@ -1626,9 +1627,9 @@ void spatial_softmax_backward( scalar_t, \ accscalar_t, \ LogSoftMax>( \ - gradInput.data_ptr(), \ - output.data_ptr(), \ - gradOutput.data_ptr(), \ + gradInput.mutable_data_ptr(), \ + output.const_data_ptr(), \ + gradOutput.const_data_ptr(), \ dim_size, \ inner_size, \ outer_size); @@ -1718,9 +1719,9 @@ Tensor& masked_softmax_forward( using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); int input_start = - ((uint64_t)input.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)input.const_data_ptr()) % align_bytes / sizeof(scalar_t); int output_start = - ((uint64_t)output.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)output.const_data_ptr()) % align_bytes / sizeof(scalar_t); // decide indexing range: uint32_t (4GB) or uint64_t (>4GB) bool can_use_32bit_index = @@ -1749,11 +1750,11 @@ Tensor& masked_softmax_forward( outer_loop, \ true, \ decltype(input_calc)>( \ - input.data_ptr(), \ - output.data_ptr(), \ + input.const_data_ptr(), \ + output.mutable_data_ptr(), \ dim_size, \ outer_size, \ - mask.data_ptr(), \ + mask.const_data_ptr(), \ input_calc); \ } @@ -1838,11 +1839,11 @@ void masked_softmax_backward( using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); int gradin_start = - ((uint64_t)gradInput.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)gradInput.const_data_ptr()) % align_bytes / sizeof(scalar_t); int output_start = - ((uint64_t)output.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)output.const_data_ptr()) % align_bytes / 
sizeof(scalar_t); int gradoutput_start = - ((uint64_t)gradOutput.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)gradOutput.const_data_ptr()) % align_bytes / sizeof(scalar_t); // decide indexing range: uint32_t (4GB) or uint64_t (>4GB) bool can_use_32bit_index = canUse32BitIndexMath(gradInput) && @@ -1870,12 +1871,12 @@ void masked_softmax_backward( LogSoftMax, \ true, \ decltype(input_calc)>( \ - gradInput.data_ptr(), \ - output.data_ptr(), \ - gradOutput.data_ptr(), \ + gradInput.mutable_data_ptr(), \ + output.const_data_ptr(), \ + gradOutput.const_data_ptr(), \ dim_size, \ outer_size, \ - mask.data_ptr(), \ + mask.const_data_ptr(), \ input_calc); \ } diff --git a/src/ATen/native/xpu/sycl/Sorting.cpp b/src/ATen/native/xpu/sycl/Sorting.cpp index 05fba0bb9..a956004f4 100644 --- a/src/ATen/native/xpu/sycl/Sorting.cpp +++ b/src/ATen/native/xpu/sycl/Sorting.cpp @@ -141,7 +141,7 @@ void sort_stable_kernel( self_.scalar_type(), "sort_stable_kernel", [&]() { - scalar_t* self_ptr = self_.data_ptr(); + const scalar_t* self_ptr = self_.const_data_ptr(); int nsegments = numel / nsort; segmented_sort_pairs( self_ptr, @@ -172,11 +172,11 @@ struct GatherMedianKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { index_t indicesSliceStartIndex = IndexToOffset::get(slice, indices_); index_t inputSliceStartIndex = - IndexToOffset::get(slice, input_); + IndexToOffset::get(slice, input_); scalar_t* valuesSliceStart = values_data_ + valuesSliceStartIndex; int64_t* indicesSliceStart = indices_data_ + indicesSliceStartIndex; - scalar_t* inputSliceStart = in_data_ + inputSliceStartIndex; + const scalar_t* inputSliceStart = in_data_ + inputSliceStartIndex; index_t nan_count = 0; for (index_t i = item.get_local_id(0); i < inputSliceSize_; @@ -216,7 +216,7 @@ struct GatherMedianKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { typename TopKTypeConfig::RadixType, index_t, false>( - (sycl_global_ptr)inputSliceStart, + inputSliceStart, k + 1, inputSliceSize_, inputWithinSliceStride_, @@ -245,12 +245,12 @@ struct GatherMedianKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { GatherMedianKernelFunctor( TensorInfo values, TensorInfo indices, - TensorInfo input, + TensorInfo input, index_t inputSliceSize, index_t numInputSlices, index_t inputWithinSliceStride, bool ignore_nan, - scalar_t* in_data, + const scalar_t* in_data, scalar_t* values_data, int64_t* indices_data) : values_(values), @@ -267,12 +267,12 @@ struct GatherMedianKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { private: TensorInfo values_; TensorInfo indices_; - TensorInfo input_; + TensorInfo input_; index_t inputSliceSize_; index_t numInputSlices_; index_t inputWithinSliceStride_; bool ignore_nan_; - scalar_t* in_data_; + const scalar_t* in_data_; scalar_t* values_data_; int64_t* indices_data_; sycl_local_acc_t smem_; @@ -284,7 +284,7 @@ template void gatherMedian( TensorInfo values, TensorInfo indices, - TensorInfo input, + TensorInfo input, index_t inputSliceSize, index_t numInputSlices, index_t inputWithinSliceStride, @@ -323,7 +323,7 @@ struct MedianLauncher { int collapse_values_dim, TensorInfo indices_info, int collapse_indices_dim, - TensorInfo self_info, + TensorInfo self_info, int collapse_self_dim, int64_t num_slices, int64_t slice_size) { @@ -344,6 +344,7 @@ void launch_median_kernel( const TensorBase& self, int64_t dim, bool ignore_nan) { + // return; AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, diff --git a/src/ATen/native/xpu/sycl/SortingCommon.h 
b/src/ATen/native/xpu/sycl/SortingCommon.h index a5779cb23..2d6a0520f 100644 --- a/src/ATen/native/xpu/sycl/SortingCommon.h +++ b/src/ATen/native/xpu/sycl/SortingCommon.h @@ -358,7 +358,7 @@ void run_launcher( const TensorBase& self, int64_t dim, Launcher l) { - auto self_info = getTensorInfo(self); + auto self_info = getTensorInfo(self); auto values_info = getTensorInfo(values); auto indices_info = getTensorInfo(indices); diff --git a/src/ATen/native/xpu/sycl/SortingRadixSelect.h b/src/ATen/native/xpu/sycl/SortingRadixSelect.h index f5df40567..d9c8e5fdb 100644 --- a/src/ATen/native/xpu/sycl/SortingRadixSelect.h +++ b/src/ATen/native/xpu/sycl/SortingRadixSelect.h @@ -223,7 +223,7 @@ void countRadixUsingMask( int radixDigitPos, index_t sliceSize, index_t withinSliceStride, - const sycl_global_ptr& data, + const scalar_t* data, sycl::nd_item<1>& item_id) { // Clear out per-thread counts from a previous round for (int i = 0; i < RadixSize; ++i) { @@ -276,7 +276,7 @@ constexpr int RADIX_MASK = (RADIX_SIZE - 1); template scalar_t findPattern( const sycl_local_acc_t& smem, - const sycl_global_ptr& data, + const scalar_t* data, index_t sliceSize, index_t withinSliceStride, bitwise_t desired, @@ -329,7 +329,7 @@ scalar_t findPattern( // Returns the top-Kth element found in the data using radix selection template void radixSelect( - const sycl_global_ptr& data, + const scalar_t* data, index_t k, index_t sliceSize, index_t withinSliceStride, diff --git a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp index 7a2835491..cc9d0ba52 100644 --- a/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/SummaryOpsKernels.cpp @@ -52,7 +52,7 @@ struct Histogram1DKernelFunctor { auto linear_index = item_id.get_id(0); // Convert `linear_index` into an offset of `b` const IndexType b_offset = - IndexToOffset::get(linear_index, b_); + IndexToOffset::get(linear_index, b_); const auto b_val = in_ptr[b_offset]; if (b_val >= min_value_ && b_val <= max_value_) { // Use value at `b` as an offset of `a` @@ -67,7 +67,7 @@ struct Histogram1DKernelFunctor { } Histogram1DKernelFunctor( TensorInfo a, - TensorInfo b, + TensorInfo b, TensorInfo c, int nbins, at::acc_type_device minvalue, @@ -85,7 +85,7 @@ struct Histogram1DKernelFunctor { private: TensorInfo a_; - TensorInfo b_; + TensorInfo b_; TensorInfo c_; int nbins_; at::acc_type_device min_value_; @@ -106,7 +106,7 @@ template < typename Op> void histogram_1d_kernel( TensorInfo a, /* output */ - TensorInfo b, /* input */ + TensorInfo b, /* input */ TensorInfo c, /* weight */ int nbins, at::acc_type_device min_value, @@ -171,7 +171,7 @@ void tensor_histogram( using IndexType = int64_t; auto a_info = getTensorInfo(a); - auto b_info = getTensorInfo(b); + auto b_info = getTensorInfo(b); if (has_weights) { auto c_info = getTensorInfo(c); const IndexingFunctor get_weights_op( diff --git a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp index f965f28ee..d9dfbe69a 100644 --- a/src/ATen/native/xpu/sycl/TensorModeKernel.cpp +++ b/src/ATen/native/xpu/sycl/TensorModeKernel.cpp @@ -556,8 +556,8 @@ struct ModeKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { } ModeKernelFunctor( - scalar_t* problem_values_ptr, - int64_t* problem_indices_ptr, + const scalar_t* problem_values_ptr, + const int64_t* problem_indices_ptr, TensorInfo values_info, TensorInfo indices_info, int64_t* scratch_status_ptr, @@ -580,8 +580,8 @@ struct ModeKernelFunctor : public 
__SYCL_KER_CONFIG_CONVENTION__ { problem_upper_limit_(problem_upper_limit) {} private: - scalar_t* problem_values_ptr_; - int64_t* problem_indices_ptr_; + const scalar_t* problem_values_ptr_; + const int64_t* problem_indices_ptr_; TensorInfo values_info_; TensorInfo indices_info_; int64_t* scratch_status_ptr_; @@ -634,7 +634,7 @@ struct ModeFusedKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { } ModeFusedKernelFunctor( - scalar_t* problem_values_ptr, + const scalar_t* problem_values_ptr, TensorInfo values_info, TensorInfo indices_info, int64_t sort_scratch_memory_size, @@ -652,7 +652,7 @@ struct ModeFusedKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { group_size_(group_size) {} private: - scalar_t* problem_values_ptr_; + const scalar_t* problem_values_ptr_; TensorInfo values_info_; TensorInfo indices_info_; int64_t sort_scratch_memory_size_; @@ -769,8 +769,8 @@ void mode_kernel_impl( ? (problem_size) : ((problem_size / group_size + 1) * group_size); - auto problem_values_ptr = problem_values.data_ptr(); - auto problem_indices_ptr = problem_indices.data_ptr(); + auto problem_values_ptr = problem_values.const_data_ptr(); + auto problem_indices_ptr = problem_indices.const_data_ptr(); auto scratch_status_ptr = scratch_status_tensor.data_ptr(); auto scratch_value_ptr = scratch_value_tensor.data_ptr(); ModeKernelFunctor kfn( @@ -809,8 +809,9 @@ void mode_kernel_impl( auto values_info = getTensorInfo(values_transposed); auto indices_info = getTensorInfo(indices_transposed); + + auto problem_values_ptr = contiguous.const_data_ptr(); - auto problem_values_ptr = contiguous.data_ptr(); ModeFusedKernelFunctor kfn( problem_values_ptr, values_info, diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp index 7ee34a16f..1e576f7bc 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp @@ -160,7 +160,7 @@ struct RollKernelFunctor { } } RollKernelFunctor( - scalar_t* in_data, + const scalar_t* in_data, scalar_t* out_data, int val_of_work_item, int64_t N, @@ -182,7 +182,7 @@ struct RollKernelFunctor { global_range_(global_range) {} private: - scalar_t* in_data_; + const scalar_t* in_data_; scalar_t* out_data_; int val_of_work_item_; int64_t N_; @@ -220,7 +220,7 @@ void roll_template( global_range = global_range < target_global_range ? 
global_range : target_global_range; - auto in_data = in_tensor.data_ptr(); + auto in_data = in_tensor.const_data_ptr(); auto out_data = out_tensor.data_ptr(); KernelClass kfn( in_data, diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index 45fcc655e..ba403e8a7 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -43,7 +43,7 @@ struct ApplyTriuTrilKernelFunctor { IndexType result_stride_0_, IndexType result_stride_1_, scalar_t* result_ptr_, - scalar_t* self_ptr_) + const scalar_t* self_ptr_) : k(k_), N(N_), self_size_0(self_size_0_), @@ -69,7 +69,7 @@ struct ApplyTriuTrilKernelFunctor { IndexType result_stride_0; IndexType result_stride_1; scalar_t* result_ptr; - scalar_t* self_ptr; + const scalar_t* self_ptr; }; template @@ -89,7 +89,7 @@ void apply_triu_tril( IndexType result_stride_1 = (IndexType)result.stride(-1); scalar_t* result_ptr = result.data_ptr(); - scalar_t* self_ptr = self.data_ptr(); + const scalar_t* self_ptr = self.const_data_ptr(); ApplyTriuTrilKernelFunctor kfn( k, diff --git a/src/ATen/native/xpu/sycl/UniqueKernels.cpp b/src/ATen/native/xpu/sycl/UniqueKernels.cpp index 2e05d4be0..27e802bf2 100644 --- a/src/ATen/native/xpu/sycl/UniqueKernels.cpp +++ b/src/ATen/native/xpu/sycl/UniqueKernels.cpp @@ -154,7 +154,7 @@ std::tuple unique_template( if (!consecutive) { at::native::xpu::pstl::sort( - self_c.data_ptr(), + self_c.const_data_ptr(), output.data_ptr(), sorted_indices.data_ptr(), num_inp, @@ -258,13 +258,16 @@ struct UniqueDimLessFunctor { return false; } - UniqueDimLessFunctor(int64_t num_inp, int64_t n, scalar_t* input_flat_ptr) + UniqueDimLessFunctor( + int64_t num_inp, + int64_t n, + const scalar_t* input_flat_ptr) : num_inp_(num_inp), n_(n), input_flat_ptr_(input_flat_ptr) {} private: int64_t num_inp_; int64_t n_; - scalar_t* input_flat_ptr_; + const scalar_t* input_flat_ptr_; }; template @@ -280,12 +283,12 @@ struct UniqueDimEqualFunctor { } return true; } - UniqueDimEqualFunctor(int64_t n, scalar_t* input_flat_ptr) + UniqueDimEqualFunctor(int64_t n, const scalar_t* input_flat_ptr) : n_(n), input_flat_ptr_(input_flat_ptr) {} private: int64_t n_; - scalar_t* input_flat_ptr_; + const scalar_t* input_flat_ptr_; }; template @@ -301,12 +304,12 @@ struct UniqueDimNotEqualFunctor { } return false; } - UniqueDimNotEqualFunctor(int64_t n, scalar_t* input_flat_ptr) + UniqueDimNotEqualFunctor(int64_t n, const scalar_t* input_flat_ptr) : n_(n), input_flat_ptr_(input_flat_ptr) {} private: int64_t n_; - scalar_t* input_flat_ptr_; + const scalar_t* input_flat_ptr_; }; template @@ -338,7 +341,7 @@ std::tuple unique_dim_template( auto index_options = self.options().dtype(kLong); Tensor input_flat = self.moveaxis(dim, 0).contiguous().view({num_inp, -1}); int64_t n = input_flat.size(1); - scalar_t* input_flat_ptr = input_flat.data_ptr(); + const scalar_t* input_flat_ptr = input_flat.const_data_ptr(); Tensor indices = at::arange(0, num_inp, index_options); Tensor indices_idx = at::arange(0, num_inp, index_options); diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp index 653a293d8..923ebaac5 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -59,7 +59,7 @@ struct UpsampleBilinear2dKernelFunctor { const accscalar_t rheight, const accscalar_t rwidth, const bool align_corners, - const 
PackedTensorAccessor idata_acc, + const PackedTensorAccessor idata_acc, PackedTensorAccessor odata_acc, int64_t input_height, int64_t input_width, @@ -85,7 +85,7 @@ struct UpsampleBilinear2dKernelFunctor { const accscalar_t rheight_; const accscalar_t rwidth_; const bool align_corners_; - const PackedTensorAccessor in_data_acc_; + const PackedTensorAccessor in_data_acc_; PackedTensorAccessor out_data_acc_; int64_t input_height_; int64_t input_width_; @@ -101,7 +101,7 @@ void launch_upsample_bilinear2d_kernel( const accscalar_t rheight, const accscalar_t rwidth, const bool align_corners, - const PackedTensorAccessor idata_acc, + const PackedTensorAccessor idata_acc, PackedTensorAccessor odata_acc, int64_t input_height, int64_t input_width, @@ -312,7 +312,7 @@ void upsample_bilinear2d_out_kernel( "upsample_bilinear2d_xpu", [&] { using accscalar_t = acc_type_device; - auto idata_acc = input.packed_accessor64(); + auto idata_acc = input.packed_accessor64(); auto odata_acc = output.packed_accessor64(); const accscalar_t rheight = area_pixel_compute_scale( @@ -382,8 +382,8 @@ void upsample_bilinear2d_backward_out_kernel( : at::zeros(grad_input.sizes(), grad_input.options()); Tensor grad_output = grad_output_.contiguous(); - scalar_t* idata = grad_input_c.data_ptr(); - scalar_t* odata = grad_output.data_ptr(); + scalar_t* idata = grad_input_c.mutable_data_ptr(); + const scalar_t* odata = grad_output.const_data_ptr(); const accscalar_t rheight = area_pixel_compute_scale( input_height, output_height, align_corners, scales_h); diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp index 0751749fc..bd7e7ccff 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.cpp @@ -125,8 +125,8 @@ void upsample_nearest1d_backward_kernel( "upsample_nearest1d_backward_xpu", [&] { using accscalar_t = acc_type_device; - auto idata = grad_input_c.data_ptr(); - auto odata = grad_output.data_ptr(); + auto idata = grad_input_c.mutable_data_ptr(); + auto odata = grad_output.const_data_ptr(); const float scale_factor = compute_scales_value_backwards( scales, output_width, input_width); if (is_exact) { @@ -277,8 +277,8 @@ void upsample_nearest1d_kernel( input.scalar_type(), "upsample_nearest1d_xpu", [&] { - auto idata = input.data_ptr(); - auto odata = output_c.data_ptr(); + auto idata = input.const_data_ptr(); + auto odata = output_c.mutable_data_ptr(); const float scale_factor = compute_scales_value(scales, input_width, output_width); diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp index 76e95c1ed..4202da14f 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.cpp @@ -279,8 +279,8 @@ void upsample_nearest2d_backward_kernel( [&] { using accscalar_t = acc_type_device; - const scalar_t* go = grad_output.data_ptr(); - scalar_t* gi = grad_input.data_ptr(); + const scalar_t* go = grad_output.const_data_ptr(); + scalar_t* gi = grad_input.mutable_data_ptr(); if (is_exact) { upsample_nearest2d_backward_channels_last_frame< scalar_t, @@ -331,8 +331,8 @@ void upsample_nearest2d_backward_kernel( [&] { using accscalar_t = acc_type_device; - auto idata = grad_input_c.data_ptr(); - auto odata = grad_output.data_ptr(); + auto idata = grad_input_c.mutable_data_ptr(); + auto odata = grad_output.const_data_ptr(); if (is_exact) { upsample_nearest2d_backward_frame( 
n, @@ -626,8 +626,8 @@ void upsample_nearest2d_kernel( input.scalar_type(), "upsample_nearest2d_channels_last_xpu", [&] { - const scalar_t* idata = input.data_ptr(); - scalar_t* odata = output.data_ptr(); + const scalar_t* idata = input.const_data_ptr(); + scalar_t* odata = output.mutable_data_ptr(); if (is_exact) { upsample_nearest2d_channels_last_frame( idata, @@ -672,8 +672,8 @@ void upsample_nearest2d_kernel( input.scalar_type(), "upsample_nearest2d_xpu", [&] { - auto idata = input.data_ptr(); - auto odata = output_c.data_ptr(); + auto idata = input.const_data_ptr(); + auto odata = output_c.mutable_data_ptr(); if (is_exact) { upsample_nearest2d_frame( idata, diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py index 4683a39d4..6d51f34fd 100644 --- a/test/xpu/extended/skip_list_common.py +++ b/test/xpu/extended/skip_list_common.py @@ -62,7 +62,9 @@ # XPU Tensor fails in copy-on-write cases # AssertionError: False is not true : Keyword argument 'output grad 0' during backward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_backward=False` in this operation's OpInfo, add the arg to the OpInfo's `allow_cow_input_materialize_backward` list, or change the implementation to avoid materialization. # https://github.com/intel/torch-xpu-ops/issues/281 - "test_cow_input", + "test_cow_input_addr_xpu_float32", + "test_cow_input_cdist_xpu_float32", + "test_cow_input_nn_functional_multi_head_attention_forward_xpu_float32", # XPU implementation is correct. # std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified) # std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified) @@ -116,10 +118,6 @@ "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_bfloat16", # RuntimeError: "compute_index_ranges_weights" not implemented for 'Half' "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float16", - # AssertionError: False is not true : Argument 0 during forward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_forward=False... - "test_cow_input_nn_functional_interpolate_bilinear_xpu_float32", - "test_cow_input_nn_functional_interpolate_linear_xpu_float32", - "test_cow_input_nn_functional_interpolate_trilinear_xpu_float32", #The results of XPU and CUDA are consistent, but the results of CPU and CUDA are inconsistent "test_compare_cpu_nn_functional_interpolate_linear_xpu_bfloat16", "test_compare_cpu_nn_functional_interpolate_linear_xpu_float16",
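For reference, a minimal illustrative sketch (not part of this patch; names are hypothetical) of the const-correctness pattern the change applies across these SYCL kernels: inputs are read through const_data_ptr<T>() so a copy-on-write (COW) input is not materialized by the read, while outputs are written through an explicitly mutable pointer obtained via mutable_data_ptr<T>(). This is what allows the previously skipped test_cow_input_* cases to pass without unexpected materialization.

// Sketch only, assuming the standard ATen accessor API; not taken from the patch itself.
#include <ATen/ATen.h>

template <typename scalar_t>
void copy_rows(const at::Tensor& input, at::Tensor& output) {
  // Read-only view of the input: does not trigger COW materialization.
  const scalar_t* in = input.const_data_ptr<scalar_t>();
  // Explicitly mutable pointer for the result tensor.
  scalar_t* out = output.mutable_data_ptr<scalar_t>();
  for (int64_t i = 0; i < input.numel(); ++i) {
    out[i] = in[i];
  }
}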