diff --git a/.github/workflows/ubuntu-sycl.yml b/.github/workflows/ubuntu-sycl.yml index daad144ce08..bce4e90f4d7 100644 --- a/.github/workflows/ubuntu-sycl.yml +++ b/.github/workflows/ubuntu-sycl.yml @@ -49,6 +49,26 @@ jobs: docker/docker_test.sh sycl-static fi + - name: Upload Python wheel and C++ binary package to GitHub artifacts + if: ${{ matrix.BUILD_SHARED_LIBS == 'ON' }} + uses: actions/upload-artifact@v4 + with: + name: open3d-sycl-linux-wheel-and-binary + path: | + open3d-*.whl + open3d-*.tar.xz + if-no-files-found: error + - name: Update devel release + if: ${{ github.ref == 'refs/heads/main' && matrix.BUILD_SHARED_LIBS == 'ON' }} + env: + GH_TOKEN: ${{ github.token }} + run: | + if [ ${{ matrix.BUILD_SHARED_LIBS }} == 'ON' ] ; then + gh release upload main-devel open3d-*.whl --clobber + gh release upload main-devel open3d-*.tar.xz --clobber + fi + gh release view main-devel + - name: GCloud CLI auth if: ${{ github.ref == 'refs/heads/main' }} uses: 'google-github-actions/auth@v2' diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake index 9beb7da33c7..a782265a553 100644 --- a/3rdparty/find_dependencies.cmake +++ b/3rdparty/find_dependencies.cmake @@ -1534,12 +1534,17 @@ open3d_import_3rdparty_library(3rdparty_uvatlas list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_uvatlas) +# SYCL link options are specified here. 
Compile options are only applied to SYCL source files and are specified in cmake/Open3DSYCLTargetSources.cmake if(BUILD_SYCL_MODULE) add_library(3rdparty_sycl INTERFACE) target_link_libraries(3rdparty_sycl INTERFACE $<$,$>>:sycl>) target_link_options(3rdparty_sycl INTERFACE - $<$,$>>:-fsycl -fsycl-targets=spir64_x86_64>) + $<$,$>>:-fsycl -fsycl-targets=${OPEN3D_SYCL_TARGETS}>) + if (OPEN3D_SYCL_TARGET_BACKEND_OPTIONS) + target_link_options(3rdparty_sycl INTERFACE + $<$,$>>:-Xs ${OPEN3D_SYCL_TARGET_BACKEND_OPTIONS}>) + endif() if(NOT BUILD_SHARED_LIBS OR arg_PUBLIC) install(TARGETS 3rdparty_sycl EXPORT Open3DTargets) endif() @@ -1565,8 +1570,12 @@ if(OPEN3D_USE_ONEAPI_PACKAGES) GROUPED INCLUDE_DIRS ${MKL_INCLUDE}/ LIB_DIR ${MKL_ROOT}/lib/intel64 - LIBRARIES mkl_intel_ilp64 mkl_tbb_thread mkl_core + LIBRARIES $<$:mkl_sycl> mkl_intel_ilp64 mkl_tbb_thread mkl_core ) + if (BUILD_SYCL_MODULE) + # target_link_options(3rdparty_mkl INTERFACE "-Wl,-export-dynamic") + target_link_libraries(3rdparty_mkl INTERFACE OpenCL) + endif() # MKL definitions target_compile_options(3rdparty_mkl INTERFACE "$<$:$<$:-m64>>") target_compile_definitions(3rdparty_mkl INTERFACE "$<$:MKL_ILP64>") diff --git a/CMakeLists.txt b/CMakeLists.txt index e1bd4706288..354125dc01d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,16 @@ else() option(STATIC_WINDOWS_RUNTIME "Use static (MT/MTd) Windows runtime" ON ) endif() option(BUILD_SYCL_MODULE "Build SYCL module with Intel oneAPI" OFF) +if(BUILD_SYCL_MODULE) + set(OPEN3D_SYCL_TARGETS "spir64" CACHE STRING + "SYCL targets: spir64 for JIT, or another for AOT compilation. See https://github.com/intel/llvm/blob/sycl/sycl/doc/UsersManual.md." +) + set(OPEN3D_SYCL_TARGET_BACKEND_OPTIONS "" CACHE STRING + "SYCL target backend options, e.g. to compile for a specific device. See https://github.com/intel/llvm/blob/sycl/sycl/doc/UsersManual.md." 
+) + set(BUILD_ISPC_MODULE OFF CACHE BOOL "Build the ISPC module" FORCE) + set(BUILD_CUDA_MODULE OFF CACHE BOOL "Build the CUDA module" FORCE) +endif() option(GLIBCXX_USE_CXX11_ABI "Set -D_GLIBCXX_USE_CXX11_ABI=1" ON ) option(ENABLE_SYCL_UNIFIED_SHARED_MEMORY "Enable SYCL unified shared memory" OFF) if(BUILD_GUI AND (WIN32 OR UNIX AND NOT LINUX_AARCH64 AND NOT APPLE_AARCH64)) @@ -282,12 +292,6 @@ endif() if(BUILD_SYCL_MODULE AND NOT GLIBCXX_USE_CXX11_ABI) message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires GLIBCXX_USE_CXX11_ABI=ON") endif() -if(BUILD_SYCL_MODULE AND BUILD_TENSORFLOW_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_TENSORFLOW_OPS=OFF") -endif() -if(BUILD_SYCL_MODULE AND BUILD_PYTORCH_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_PYTORCH_OPS=OFF") -endif() if(BUILD_SYCL_MODULE AND BUILD_CUDA_MODULE) message(FATAL_ERROR "BUILD_SYCL_MODULE and BUILD_SYCL_MODULE cannot be on at the same time for now.") endif() diff --git a/cmake/Open3DSYCLTargetSources.cmake b/cmake/Open3DSYCLTargetSources.cmake index 9b0220c10ab..3314cb24303 100644 --- a/cmake/Open3DSYCLTargetSources.cmake +++ b/cmake/Open3DSYCLTargetSources.cmake @@ -2,7 +2,8 @@ # # When BUILD_SYCL_MODULE=ON, set SYCL-specific compile flags for the listed # source files and call target_sources(). If BUILD_SYCL_MODULE=OFF, this -# function directly calls target_sources(). +# function directly calls target_sources(). For SYCL link options, see +# 3rdparty/find_dependencies.cmake # # Note: this is not a perfect forwarding to target_sources(), as it only support # limited set of arguments. See the example usage below. 
@@ -31,7 +32,7 @@ function(open3d_sycl_target_sources target) if(BUILD_SYCL_MODULE) foreach(sycl_file IN LISTS arg_UNPARSED_ARGUMENTS) set_source_files_properties(${sycl_file} PROPERTIES - COMPILE_OPTIONS -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_x86_64) + COMPILE_OPTIONS "-fsycl;-fsycl-targets=${OPEN3D_SYCL_TARGETS}") if(arg_VERBOSE) message(STATUS "open3d_sycl_target_sources(${target}): marked ${sycl_file} as SYCL code") endif() diff --git a/cmake/Open3DSetGlobalProperties.cmake b/cmake/Open3DSetGlobalProperties.cmake index 03e43212a8b..ce8ff8d217e 100644 --- a/cmake/Open3DSetGlobalProperties.cmake +++ b/cmake/Open3DSetGlobalProperties.cmake @@ -25,15 +25,16 @@ function(open3d_enable_strip target) endif() endfunction() -# RPATH handling (for TBB DSO). Check current folder, one folder above and the lib sibling folder +# RPATH handling (for TBB DSO). Check current folder, one folder above and the lib sibling folder. +# Also check the Python virtual env /lib folder for 3rd party dependency libraries installed with `pip install` set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) if (APPLE) -# Add options to cover the various ways in which open3d shaed lib or apps can be installed wrt TBB DSO - set(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/../;@loader_path/../lib/") +# Add options to cover the various ways in which open3d shared lib or apps can be installed wrt TBB DSO + set(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/../;@loader_path/../lib/;@loader_path/../../../../") # pybind with open3d shared lib is copied, not cmake-installed, so we need to add .. 
to build rpath set(CMAKE_BUILD_RPATH "@loader_path/../") elseif(UNIX) - set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../;$ORIGIN/../lib/") + set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../;$ORIGIN/../lib/;$ORIGIN/../../../../") set(CMAKE_BUILD_RPATH "$ORIGIN/../") endif() diff --git a/cpp/apps/CMakeLists.txt b/cpp/apps/CMakeLists.txt index a0efa960d3c..f0349b60961 100644 --- a/cpp/apps/CMakeLists.txt +++ b/cpp/apps/CMakeLists.txt @@ -130,6 +130,11 @@ macro(open3d_add_app_common SRC_DIR APP_NAME TARGET_NAME) target_link_libraries(${TARGET_NAME} PRIVATE Open3D::Open3D TBB::tbb ${ARGN}) set_target_properties(${TARGET_NAME} PROPERTIES FOLDER "apps") + if (BUILD_SYCL_MODULE) + find_package(IntelSYCL REQUIRED) # requires cmake>=3.25 on Windows + add_sycl_to_target(TARGET ${TARGET_NAME}) + endif() + open3d_link_3rdparty_libraries(${TARGET_NAME}) open3d_show_and_abort_on_warning(${TARGET_NAME}) open3d_set_global_properties(${TARGET_NAME}) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 89178ce97cd..997b3dc9ab5 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -14,6 +14,10 @@ if (BUILD_CUDA_MODULE) find_package(CUDAToolkit REQUIRED) target_link_libraries(benchmarks PRIVATE CUDA::cudart) endif() +if (BUILD_SYCL_MODULE) + find_package(IntelSYCL REQUIRED) # requires cmake>=3.25 on Windows + add_sycl_to_target(TARGET benchmarks) +endif() open3d_show_and_abort_on_warning(benchmarks) open3d_set_global_properties(benchmarks) diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index c34cea3fc6d..210cc33058a 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -5,7 +5,6 @@ target_sources(core PRIVATE CUDAUtils.cpp Device.cpp Dtype.cpp - EigenConverter.cpp Indexer.cpp MemoryManager.cpp MemoryManagerCached.cpp @@ -23,6 +22,7 @@ target_sources(core PRIVATE # Compile regardless BUILD_SYCL_MODULE == ON or OFF. 
open3d_sycl_target_sources(core PRIVATE + EigenConverter.cpp SYCLUtils.cpp ) @@ -43,22 +43,14 @@ target_sources(core PRIVATE hashmap/HashSet.cpp kernel/Kernel.cpp linalg/AddMM.cpp - linalg/AddMMCPU.cpp linalg/Det.cpp linalg/Inverse.cpp - linalg/InverseCPU.cpp linalg/LeastSquares.cpp - linalg/LeastSquaresCPU.cpp linalg/LU.cpp - linalg/LUCPU.cpp linalg/Matmul.cpp - linalg/MatmulCPU.cpp linalg/Solve.cpp - linalg/SolveCPU.cpp linalg/SVD.cpp - linalg/SVDCPU.cpp linalg/Tri.cpp - linalg/TriCPU.cpp nns/FixedRadiusIndex.cpp nns/FixedRadiusSearchOps.cpp nns/KnnIndex.cpp @@ -73,21 +65,48 @@ set_target_properties(core_impl PROPERTIES CXX_VISIBILITY_PRESET "hidden") target_sources(core_impl PRIVATE kernel/Arange.cpp - kernel/ArangeCPU.cpp kernel/BinaryEW.cpp - kernel/BinaryEWCPU.cpp kernel/IndexGetSet.cpp - kernel/IndexGetSetCPU.cpp kernel/IndexReduction.cpp - kernel/IndexReductionCPU.cpp kernel/NonZero.cpp - kernel/NonZeroCPU.cpp kernel/Reduction.cpp - kernel/ReductionCPU.cpp kernel/UnaryEW.cpp + kernel/ArangeCPU.cpp + kernel/BinaryEWCPU.cpp + kernel/IndexGetSetCPU.cpp + kernel/IndexReductionCPU.cpp + kernel/NonZeroCPU.cpp + kernel/ReductionCPU.cpp kernel/UnaryEWCPU.cpp + linalg/AddMMCPU.cpp + linalg/InverseCPU.cpp + linalg/LeastSquaresCPU.cpp + linalg/LUCPU.cpp + linalg/MatmulCPU.cpp + linalg/SolveCPU.cpp + linalg/SVDCPU.cpp + linalg/TriCPU.cpp +) + +if (BUILD_SYCL_MODULE) +open3d_sycl_target_sources(core_impl PRIVATE kernel/UnaryEWSYCL.cpp + kernel/BinaryEWSYCL.cpp + kernel/ArangeSYCL.cpp + kernel/IndexGetSetSYCL.cpp + kernel/NonZeroSYCL.cpp + kernel/IndexReductionSYCL.cpp + kernel/ReductionSYCL.cpp + linalg/AddMMSYCL.cpp + linalg/InverseSYCL.cpp + linalg/LeastSquaresSYCL.cpp + linalg/LUSYCL.cpp + linalg/MatmulSYCL.cpp + linalg/SolveSYCL.cpp + linalg/SVDSYCL.cpp + linalg/TriSYCL.cpp ) +endif() if (BUILD_CUDA_MODULE) target_sources(core PRIVATE diff --git a/cpp/open3d/core/Device.h b/cpp/open3d/core/Device.h index a79d9cf646c..d215d16e55e 100644 --- a/cpp/open3d/core/Device.h 
+++ b/cpp/open3d/core/Device.h @@ -115,4 +115,12 @@ struct hash { return std::hash{}(device.ToString()); } }; + +template <> +struct less { + bool operator()(const open3d::core::Device& lhs, + const open3d::core::Device& rhs) const { + return lhs.ToString() < rhs.ToString(); + } +}; } // namespace std diff --git a/cpp/open3d/core/Indexer.h b/cpp/open3d/core/Indexer.h index c9eb8933d49..298997117b5 100644 --- a/cpp/open3d/core/Indexer.h +++ b/cpp/open3d/core/Indexer.h @@ -34,11 +34,11 @@ class Indexer; class IndexerIterator; // Maximum number of dimensions of TensorRef. -static constexpr int64_t MAX_DIMS = 10; +static constexpr int64_t MAX_DIMS = 5; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -static constexpr int64_t MAX_INPUTS = 10; +static constexpr int64_t MAX_INPUTS = 5; // Maximum number of outputs of an op. This number can be increased when // necessary. @@ -110,7 +110,7 @@ struct TensorRef { TensorRef(const Tensor& t) { if (t.NumDims() > MAX_DIMS) { - utility::LogError("Tenor has too many dimensions {} > {}.", + utility::LogError("Tensor has too many dimensions {} > {}.", t.NumDims(), MAX_DIMS); } data_ptr_ = const_cast(t.GetDataPtr()); diff --git a/cpp/open3d/core/Indexer.isph b/cpp/open3d/core/Indexer.isph index c9b63486377..af7629d3f06 100644 --- a/cpp/open3d/core/Indexer.isph +++ b/cpp/open3d/core/Indexer.isph @@ -11,11 +11,11 @@ #include "open3d/utility/Helper.isph" // Maximum number of dimensions of TensorRef. -enum { MAX_DIMS = 10 }; +enum { MAX_DIMS = 4 }; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -enum { MAX_INPUTS = 10 }; +enum { MAX_INPUTS = 4 }; // Maximum number of outputs of an op. This number can be increased when // necessary. 
diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h index 2d5bef78812..9e917789947 100644 --- a/cpp/open3d/core/ParallelFor.h +++ b/cpp/open3d/core/ParallelFor.h @@ -99,6 +99,11 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) { /// \note If you use a lambda function, capture only the required variables /// instead of all to prevent accidental race conditions. If you want the /// kernel to be used on both CPU and CUDA, capture the variables by value. +/// \note This does not dispatch to SYCL, since SYCL has extra constraints: +/// - Lambdas may capture by value only. +/// - No function pointers / virtual functions. +/// Auto dispatch to SYCL will enforce these conditions even on CPU devices. Use +/// ParallelForSYCL instead. template void ParallelFor(const Device& device, int64_t n, const func_t& func) { #ifdef __CUDACC__ diff --git a/cpp/open3d/core/ParallelForSYCL.h b/cpp/open3d/core/ParallelForSYCL.h new file mode 100644 index 00000000000..d74d4853734 --- /dev/null +++ b/cpp/open3d/core/ParallelForSYCL.h @@ -0,0 +1,63 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#pragma once + +#include +#include + +#include "open3d/core/Device.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + Indexer indexer, + FuncArgs... 
func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + int64_t n = indexer.NumWorkloads(); + if (n == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(n, [indexer, func_args...](int64_t i) { + Functor ef(indexer, func_args...); + ef(i); + }).wait_and_throw(); +} + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + int64_t num_workloads, + FuncArgs... func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + if (num_workloads == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(num_workloads, [func_args...](int64_t i) { + Functor ef(func_args...); + ef(i); + }).wait_and_throw(); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/SYCLContext.cpp b/cpp/open3d/core/SYCLContext.cpp index 30c5f6f6afc..55115746b35 100644 --- a/cpp/open3d/core/SYCLContext.cpp +++ b/cpp/open3d/core/SYCLContext.cpp @@ -19,6 +19,8 @@ namespace open3d { namespace core { namespace sy { +OPEN3D_DLL_LOCAL std::string GetDeviceTypeName(const sycl::device &device); + SYCLContext &SYCLContext::GetInstance() { static thread_local SYCLContext instance; return instance; @@ -27,19 +29,45 @@ SYCLContext &SYCLContext::GetInstance() { bool SYCLContext::IsAvailable() { return devices_.size() > 0; } bool SYCLContext::IsDeviceAvailable(const Device &device) { - bool rc = false; - for (const Device &device_ : devices_) { - if (device == device_) { - rc = true; - break; - } + return devices_.find(device) != devices_.end(); +} +std::vector SYCLContext::GetAvailableSYCLDevices() { + std::vector device_vec; + for (const auto &device : devices_) { + 
device_vec.push_back(device.first); } - return rc; + return device_vec; } -std::vector SYCLContext::GetAvailableSYCLDevices() { return devices_; } sycl::queue SYCLContext::GetDefaultQueue(const Device &device) { - return device_to_default_queue_.at(device); + return devices_.at(device).queue; +} + +SYCLDevice::SYCLDevice(const sycl::device &sycl_device) { + namespace sid = sycl::info::device; + device = sycl_device; + queue = sycl::queue(device); + name = device.get_info(); + device_type = GetDeviceTypeName(device); + max_work_group_size = device.get_info(); + auto aspects = device.get_info(); + fp64 = std::find(aspects.begin(), aspects.end(), sycl::aspect::fp64) != + aspects.end(); + if (!fp64) { + utility::LogWarning( + "SYCL device {} does not support double precision. Using " + "emulation.", + name); + } + usm_device_allocations = + std::find(aspects.begin(), aspects.end(), + sycl::aspect::usm_device_allocations) != aspects.end(); + if (!usm_device_allocations) { + utility::LogWarning( + "SYCL device {} does not support USM device allocations. 
" "Open3D SYCL support may not work.", + name); + } } SYCLContext::SYCLContext() { @@ -48,9 +76,7 @@ SYCLContext::SYCLContext() { try { const sycl::device &sycl_device = sycl::device(sycl::gpu_selector_v); const Device open3d_device = Device("SYCL:0"); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } @@ -66,9 +92,7 @@ SYCLContext::SYCLContext() { const sycl::device &sycl_device = sycl::device(sycl::cpu_selector_v); const Device open3d_device = Device("SYCL:" + std::to_string(devices_.size())); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } diff --git a/cpp/open3d/core/SYCLContext.h b/cpp/open3d/core/SYCLContext.h index 341da29369e..c1a523f17e3 100644 --- a/cpp/open3d/core/SYCLContext.h +++ b/cpp/open3d/core/SYCLContext.h @@ -14,8 +14,8 @@ #pragma once +#include #include -#include #include "open3d/core/Device.h" @@ -23,6 +23,19 @@ namespace open3d { namespace core { namespace sy { +/// @brief SYCL device properties. +struct SYCLDevice { + SYCLDevice(const sycl::device& sycl_device); + std::string name; ///< Friendly / descriptive name of the device. + std::string device_type; ///< cpu, gpu, host, acc, custom, unknown. + sycl::device device; ///< SYCL device. + sycl::queue queue; ///< Default queue for this device. + size_t max_work_group_size; ///< Preferred work group size + bool fp64; ///< Double precision support, else need to emulate. + bool usm_device_allocations; ///< USM device allocations required for + ///< Open3D. +}; + /// Singleton SYCL context manager. 
It maintains: /// - A default queue for each SYCL device class SYCLContext { @@ -45,17 +58,16 @@ class SYCLContext { /// Get the default SYCL queue given an Open3D device. sycl::queue GetDefaultQueue(const Device& device); + /// Get SYCL device properties given an Open3D device. + SYCLDevice GetDeviceProperties(const Device& device) { + return devices_.at(device); + }; + private: SYCLContext(); - /// List of available Open3D SYCL devices. - std::vector devices_; - - /// Maps core::Device to the corresponding default SYCL queue. - std::unordered_map device_to_default_queue_; - - /// Maps core::Device to sycl::device. Internal use only for now. - std::unordered_map device_to_sycl_device_; + /// Map from available Open3D SYCL devices to their properties. + std::map devices_; }; } // namespace sy diff --git a/cpp/open3d/core/SYCLUtils.cpp b/cpp/open3d/core/SYCLUtils.cpp index 011ee5eb653..3a62730239d 100644 --- a/cpp/open3d/core/SYCLUtils.cpp +++ b/cpp/open3d/core/SYCLUtils.cpp @@ -84,7 +84,7 @@ int SYCLDemo() { #ifdef BUILD_SYCL_MODULE -static std::string GetDeviceTypeName(const sycl::device &device) { +OPEN3D_DLL_LOCAL std::string GetDeviceTypeName(const sycl::device &device) { auto device_type = device.get_info(); switch (device_type) { case sycl::info::device_type::cpu: @@ -95,6 +95,8 @@ static std::string GetDeviceTypeName(const sycl::device &device) { return "host"; case sycl::info::device_type::accelerator: return "acc"; + case sycl::info::device_type::custom: + return "custom"; default: return "unknown"; } @@ -210,6 +212,20 @@ bool IsDeviceAvailable(const Device &device) { #endif } +std::string GetDeviceType(const Device &device) { +#ifdef BUILD_SYCL_MODULE + if (IsDeviceAvailable(device)) { + return SYCLContext::GetInstance() + .GetDeviceProperties(device) + .device_type; + } else { + return ""; + } +#else + return ""; +#endif +} + std::vector GetAvailableSYCLDevices() { #ifdef BUILD_SYCL_MODULE return SYCLContext::GetInstance().GetAvailableSYCLDevices(); diff 
--git a/cpp/open3d/core/SYCLUtils.h b/cpp/open3d/core/SYCLUtils.h index c6e34eefe06..72d16a3bbaf 100644 --- a/cpp/open3d/core/SYCLUtils.h +++ b/cpp/open3d/core/SYCLUtils.h @@ -37,6 +37,10 @@ bool IsAvailable(); /// Returns true if the specified SYCL device is available. bool IsDeviceAvailable(const Device& device); +/// Returns the device type (cpu / gpu / accelerator / custom) of the specified +/// device as a string. Returns empty string if the device is not available. +std::string GetDeviceType(const Device& device); + /// Return a list of available SYCL devices. std::vector GetAvailableSYCLDevices(); diff --git a/cpp/open3d/core/Tensor.h b/cpp/open3d/core/Tensor.h index 7a20da6b5b2..445676d59ce 100644 --- a/cpp/open3d/core/Tensor.h +++ b/cpp/open3d/core/Tensor.h @@ -1329,7 +1329,7 @@ class Tensor : public IsDevice { /// Underlying memory buffer for Tensor. std::shared_ptr blob_ = nullptr; -}; // namespace core +}; template <> inline Tensor::Tensor(const std::vector& init_vals, @@ -1425,5 +1425,11 @@ inline Tensor operator/(T scalar_lhs, const Tensor& rhs) { return Tensor::Full({}, scalar_lhs, rhs.GetDtype(), rhs.GetDevice()) / rhs; } +inline void AssertNotSYCL(const Tensor& tensor) { + if (tensor.GetDevice().IsSYCL()) { + utility::LogError("Not supported for SYCL device."); + } +} + } // namespace core -} // namespace open3d +} // namespace open3d \ No newline at end of file diff --git a/cpp/open3d/core/kernel/Arange.cpp b/cpp/open3d/core/kernel/Arange.cpp index b3385a2a10c..3c85401eece 100644 --- a/cpp/open3d/core/kernel/Arange.cpp +++ b/cpp/open3d/core/kernel/Arange.cpp @@ -63,6 +63,12 @@ Tensor Arange(const Tensor& start, const Tensor& stop, const Tensor& step) { if (device.IsCPU()) { ArangeCPU(start, stop, step, dst); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ArangeSYCL(start, stop, step, dst); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE 
ArangeCUDA(start, stop, step, dst); diff --git a/cpp/open3d/core/kernel/Arange.h b/cpp/open3d/core/kernel/Arange.h index 1a0e88323fd..4547960b5f4 100644 --- a/cpp/open3d/core/kernel/Arange.h +++ b/cpp/open3d/core/kernel/Arange.h @@ -20,6 +20,13 @@ void ArangeCPU(const Tensor& start, const Tensor& step, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void ArangeCUDA(const Tensor& start, const Tensor& stop, diff --git a/cpp/open3d/core/kernel/ArangeSYCL.cpp b/cpp/open3d/core/kernel/ArangeSYCL.cpp new file mode 100644 index 00000000000..cda8912be90 --- /dev/null +++ b/cpp/open3d/core/kernel/ArangeSYCL.cpp @@ -0,0 +1,37 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Arange.h" + +namespace open3d { +namespace core { +namespace kernel { + +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst) { + Dtype dtype = start.GetDtype(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(start.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t sstart = start.Item(); + scalar_t sstep = step.Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + int64_t n = dst.GetLength(); + queue.parallel_for(n, [=](int64_t i) { + dst_ptr[i] = sstart + static_cast(sstep * i); + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/BinaryEW.cpp b/cpp/open3d/core/kernel/BinaryEW.cpp index 
00eb9b388e5..e34122cd137 100644 --- a/cpp/open3d/core/kernel/BinaryEW.cpp +++ b/cpp/open3d/core/kernel/BinaryEW.cpp @@ -51,6 +51,12 @@ void BinaryEW(const Tensor& lhs, if (lhs.IsCPU()) { BinaryEWCPU(lhs, rhs, dst, op_code); + } else if (lhs.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + BinaryEWSYCL(lhs, rhs, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (lhs.IsCUDA()) { #ifdef BUILD_CUDA_MODULE BinaryEWCUDA(lhs, rhs, dst, op_code); diff --git a/cpp/open3d/core/kernel/BinaryEW.h b/cpp/open3d/core/kernel/BinaryEW.h index fbd4af3e849..2f2cb57888e 100644 --- a/cpp/open3d/core/kernel/BinaryEW.h +++ b/cpp/open3d/core/kernel/BinaryEW.h @@ -48,6 +48,13 @@ void BinaryEWCPU(const Tensor& lhs, Tensor& dst, BinaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void BinaryEWCUDA(const Tensor& lhs, const Tensor& rhs, diff --git a/cpp/open3d/core/kernel/BinaryEWSYCL.cpp b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp new file mode 100644 index 00000000000..1466b11f97e --- /dev/null +++ b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp @@ -0,0 +1,270 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelForSYCL.h" +#include "open3d/core/SizeVector.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/BinaryEW.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { + 
+struct BinaryElementKernel { + void operator()(int64_t i) {} + BinaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + +protected: + Indexer indexer; +}; + +// Min, Max +#define BINARY_ELEMENT_KERNEL(name, elem_fn) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = elem_fn(*lhs, *rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Max, sycl::max); +BINARY_ELEMENT_KERNEL(Min, sycl::min); +#undef BINARY_ELEMENT_KERNEL + +/// Specialize Min, Max for Bool, since sycl::min, sycl::max do not support it. +template <> +struct MaxElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs || *rhs; + } +}; +template <> +struct MinElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs && *rhs; + } +}; + +// Arithmetic and Relational ops. 
+#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = (*lhs)elem_op(*rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Add, +); +BINARY_ELEMENT_KERNEL(Sub, -); +BINARY_ELEMENT_KERNEL(Mul, *); +BINARY_ELEMENT_KERNEL(Div, /); +BINARY_ELEMENT_KERNEL(Gt, >); +BINARY_ELEMENT_KERNEL(Lt, <); +BINARY_ELEMENT_KERNEL(Geq, >=); +BINARY_ELEMENT_KERNEL(Leq, <=); +BINARY_ELEMENT_KERNEL(Eq, ==); +BINARY_ELEMENT_KERNEL(Neq, !=); +#undef BINARY_ELEMENT_KERNEL + +// Logical ops. +#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(*lhs) elem_op static_cast(*rhs); \ + } \ + } +BINARY_ELEMENT_KERNEL(LogicalAnd, &&); +BINARY_ELEMENT_KERNEL(LogicalOr, ||); +BINARY_ELEMENT_KERNEL(LogicalXor, !=); +#undef BINARY_ELEMENT_KERNEL + +} // namespace + +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code) { + Dtype src_dtype = lhs.GetDtype(); + Dtype dst_dtype = dst.GetDtype(); + Device device = lhs.GetDevice(); + + if (s_boolean_binary_ew_op_codes.find(op_code) != + s_boolean_binary_ew_op_codes.end()) { + if (dst_dtype == src_dtype) { + // Inplace boolean op's output type is the same as the + // input. e.g. np.logical_and(a, b, out=a), where a, b are + // floats. 
+ Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else if (dst_dtype == core::Bool) { + // By default, output is boolean type. + Indexer indexer({lhs, rhs}, dst, + DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL< + LogicalAndElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL< + LogicalXorElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>( + device, indexer); + break; + default: + break; + } + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the 
input."); + } + } else if (op_code == BinaryEWOpCode::Maximum || + op_code == BinaryEWOpCode::Minimum) { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Maximum: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Minimum: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Add: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Sub: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Mul: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Div: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } +} +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexGetSet.cpp b/cpp/open3d/core/kernel/IndexGetSet.cpp index 601b4d73f64..b880f50fb51 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.cpp +++ b/cpp/open3d/core/kernel/IndexGetSet.cpp @@ -35,6 +35,10 @@ void IndexGet(const Tensor& src, if (src.IsCPU()) { IndexGetCPU(src, dst, index_tensors, indexed_shape, indexed_strides); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexGetSYCL(src, dst, index_tensors, indexed_shape, indexed_strides); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexGetCUDA(src, dst, index_tensors, indexed_shape, indexed_strides); @@ -56,6 +60,11 @@ void IndexSet(const Tensor& src, if (dst.IsCPU()) { IndexSetCPU(src_same_device, dst, index_tensors, indexed_shape, indexed_strides); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexSetSYCL(src_same_device, dst, index_tensors, indexed_shape, + indexed_strides); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexSetCUDA(src_same_device, dst, 
index_tensors, indexed_shape, diff --git a/cpp/open3d/core/kernel/IndexGetSet.h b/cpp/open3d/core/kernel/IndexGetSet.h index 130b80836a9..c4a6d3b22c2 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.h +++ b/cpp/open3d/core/kernel/IndexGetSet.h @@ -26,6 +26,14 @@ void IndexGetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexGetCUDA(const Tensor& src, Tensor& dst, @@ -46,6 +54,14 @@ void IndexSetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexSetCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp new file mode 100644 index 00000000000..dfc56397417 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp @@ -0,0 +1,76 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/AdvancedIndexing.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/IndexGetSet.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& 
indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::GET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::SET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexReduction.cpp b/cpp/open3d/core/kernel/IndexReduction.cpp index 19265d36620..e9c9df3f388 100644 --- a/cpp/open3d/core/kernel/IndexReduction.cpp +++ 
b/cpp/open3d/core/kernel/IndexReduction.cpp @@ -35,6 +35,10 @@ void IndexAdd_(int64_t dim, if (dst.IsCPU()) { IndexAddCPU_(dim, index, src_permute, dst_permute); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexAddSYCL_(dim, index, src_permute, dst_permute); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexAddCUDA_(dim, index, src_permute, dst_permute); diff --git a/cpp/open3d/core/kernel/IndexReduction.h b/cpp/open3d/core/kernel/IndexReduction.h index f4fd3516b41..2b4e122f3f0 100644 --- a/cpp/open3d/core/kernel/IndexReduction.h +++ b/cpp/open3d/core/kernel/IndexReduction.h @@ -24,6 +24,13 @@ void IndexAddCPU_(int64_t dim, const Tensor& src, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void IndexAddCUDA_(int64_t dim, const Tensor& index, diff --git a/cpp/open3d/core/kernel/IndexReductionSYCL.cpp b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp new file mode 100644 index 00000000000..47da284dc93 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp @@ -0,0 +1,61 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst) { + // index: [N,], src: [N, D], dst: [M, D] + // In Indexer, output shape defines the actual primary strides. + // However, in IndexAdd_, input dominates the iterations. 
+ // So put dst (output) at indexer's input, and src (input) at output. + Indexer indexer({dst}, src, DtypePolicy::NONE); + + // Index is simply a 1D contiguous tensor, with a different stride + // behavior to src. So use raw pointer for simplicity. + auto index_ptr = index.GetDataPtr(); + + int64_t broadcasting_elems = 1; + for (int64_t d = 1; d < src.NumDims(); ++d) { + broadcasting_elems *= src.GetShape(d); + } + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + + // TODO: Replace with SYCL reduction API + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + queue.parallel_for(index.GetLength(), [=](int64_t workload_idx) { + int64_t reduction_idx = workload_idx / broadcasting_elems; + int64_t broadcasting_idx = workload_idx % broadcasting_elems; + + const int64_t idx = index_ptr[reduction_idx]; + int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx; + + // Note input and output is switched here to adapt to the + // indexer + scalar_t* src_ptr = indexer.GetOutputPtr(0, idx); + scalar_t* dst_ptr = indexer.GetInputPtr(0, dst_idx); + sycl::atomic_ref(*dst_ptr) += + *src_ptr; + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/NonZero.cpp b/cpp/open3d/core/kernel/NonZero.cpp index 6d04f94e40a..686ce9ea885 100644 --- a/cpp/open3d/core/kernel/NonZero.cpp +++ b/cpp/open3d/core/kernel/NonZero.cpp @@ -18,6 +18,12 @@ namespace kernel { Tensor NonZero(const Tensor& src) { if (src.IsCPU()) { return NonZeroCPU(src); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return NonZeroSYCL(src); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE return NonZeroCUDA(src); diff --git a/cpp/open3d/core/kernel/NonZero.h b/cpp/open3d/core/kernel/NonZero.h index ab926ddd8c3..36ac8b4f5a0 100644 --- a/cpp/open3d/core/kernel/NonZero.h +++ 
b/cpp/open3d/core/kernel/NonZero.h @@ -17,6 +17,10 @@ Tensor NonZero(const Tensor& src); Tensor NonZeroCPU(const Tensor& src); +#ifdef BUILD_SYCL_MODULE +Tensor NonZeroSYCL(const Tensor& src); +#endif + #ifdef BUILD_CUDA_MODULE Tensor NonZeroCUDA(const Tensor& src); #endif diff --git a/cpp/open3d/core/kernel/NonZeroSYCL.cpp b/cpp/open3d/core/kernel/NonZeroSYCL.cpp new file mode 100644 index 00000000000..887f297c9b3 --- /dev/null +++ b/cpp/open3d/core/kernel/NonZeroSYCL.cpp @@ -0,0 +1,76 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include + +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/kernel/NonZero.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +Tensor NonZeroSYCL(const Tensor& src) { + // Get flattened non-zero indices. 
+ TensorIterator src_iter(src); + const int64_t num_elements = src.NumElements(); + auto device = src.GetDevice(); + Tensor indices = Tensor::Arange(0, num_elements, 1, core::Int64, device); + Tensor non_zero_indices(SizeVector({num_elements}), Int64, device); + int64_t *non_zero_indices_ptr = non_zero_indices.GetDataPtr(), + *indices_ptr = indices.GetDataPtr(); + size_t num_non_zeros; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src.GetDtype(), [&]() { + auto it = std::copy_if( + oneapi::dpl::execution::dpcpp_default, indices_ptr, + indices_ptr + num_elements, non_zero_indices_ptr, + [src_iter](int64_t index) { + auto src_ptr = static_cast( + src_iter.GetPtr(index)); + OPEN3D_ASSERT(src_ptr != nullptr && "Internal error."); + return *src_ptr != 0; + }); + num_non_zeros = std::distance(non_zero_indices_ptr, it); + }); + + // Transform flattened indices to indices in each dimension. + const auto num_dims = src.NumDims(); + SizeVector shape = src.GetShape(); + // MAX_DIMS: Maximum number of dimensions of TensorRef, defined in + // Indexer.h. 
+ sycl::marray shape_vec; // device copyable + if (shape.size() > MAX_DIMS) { + utility::LogError("Too many dimensions: {} > MAX_DIMS={}.", + shape.size(), MAX_DIMS); + } + for (auto k = 0; k < num_dims; ++k) shape_vec[k] = shape[k]; + Tensor result({num_dims, static_cast(num_non_zeros)}, Int64, + device); + int64_t* result_ptr = result.GetDataPtr(); + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.parallel_for(num_non_zeros, [=](int64_t i) { + auto non_zero_index = non_zero_indices_ptr[i]; + auto this_result_ptr = + result_ptr + i + (num_dims - 1) * num_non_zeros; + OPEN3D_ASSERT(this_result_ptr != nullptr && "Internal error."); + for (auto dim = num_dims - 1; dim >= 0; + dim--, this_result_ptr -= num_non_zeros) { + *this_result_ptr = non_zero_index % shape_vec[dim]; + non_zero_index = non_zero_index / shape_vec[dim]; + } + }).wait_and_throw(); + return result; +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/Reduction.cpp b/cpp/open3d/core/kernel/Reduction.cpp index a94087d609a..1c022f27dc8 100644 --- a/cpp/open3d/core/kernel/Reduction.cpp +++ b/cpp/open3d/core/kernel/Reduction.cpp @@ -37,6 +37,10 @@ void Reduction(const Tensor& src, dims); } } + if (src.NumElements() == 0) { + utility::LogError( + "Zero-size Tensor does not support Arg Reductions."); + } } SizeVector keepdim_shape = @@ -71,6 +75,12 @@ void Reduction(const Tensor& src, if (src.IsCPU()) { ReductionCPU(src, dst, dims, keepdim, op_code); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ReductionSYCL(src, dst, dims, keepdim, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE ReductionCUDA(src, dst, dims, keepdim, op_code); diff --git a/cpp/open3d/core/kernel/Reduction.h b/cpp/open3d/core/kernel/Reduction.h index a1a4b74e3a3..ec3bec62ab7 100644 --- a/cpp/open3d/core/kernel/Reduction.h +++ 
b/cpp/open3d/core/kernel/Reduction.h @@ -59,6 +59,14 @@ void ReductionCPU(const Tensor& src, bool keepdim, ReductionOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void ReductionCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/ReductionSYCL.cpp b/cpp/open3d/core/kernel/ReductionSYCL.cpp new file mode 100644 index 00000000000..8c93da87729 --- /dev/null +++ b/cpp/open3d/core/kernel/ReductionSYCL.cpp @@ -0,0 +1,320 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Reduction.h" +#include "open3d/utility/Logging.h" +#include "open3d/utility/Parallel.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { + +template +struct ArgMinReduction { + using basic_reduction = sycl::minimum; + std::pair operator()(int64_t a_idx, + scalar_t a_val, + int64_t b_idx, + scalar_t b_val) const { + return a_val < b_val ? std::make_pair(a_idx, a_val) + : std::make_pair(b_idx, b_val); + } +}; + +template +struct ArgMaxReduction { + using basic_reduction = sycl::maximum; + std::pair operator()(int64_t a_idx, + scalar_t a_val, + int64_t b_idx, + scalar_t b_val) const { + return a_val > b_val ? 
std::make_pair(a_idx, a_val) + : std::make_pair(b_idx, b_val); + } +}; + +// TODO: This launches one kernel per output element, which can be inefficient +// in cases where the reduction dim is small but the non-reduced dim is large. +// Unit tests for a large number of outputs are disabled. +// Speed-up by launching one kernel for the entire reduction. +template +void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) { + auto device_props = + sy::SYCLContext::GetInstance().GetDeviceProperties(device); + auto queue = device_props.queue; + auto work_group_size = device_props.max_work_group_size; + size_t log2elements_per_group = 13; + auto elements_per_group = (1 << log2elements_per_group); // 8192 + size_t log2workitems_per_group = 8; + auto workitems_per_group = (1 << log2workitems_per_group); // 256 + auto elements_per_work_item = + elements_per_group / workitems_per_group; // 32 (= max SIMD size) + auto mask = ~(~0 << log2workitems_per_group); + ReductionOp red_op; + + for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements(); + output_idx++) { + // sub_indexer.NumWorkloads() == ipo. + // sub_indexer's workload_idx is indexer's ipo_idx. + Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx); + auto num_elements = scalar_out_indexer.NumWorkloads(); + auto num_work_groups = num_elements / elements_per_group; + if (num_elements > elements_per_group * num_work_groups) + ++num_work_groups; + // ensure each work group has work_group_size work items + auto num_work_items = num_work_groups * work_group_size; + + auto red_cg = [&](auto& cgh) { + auto output = scalar_out_indexer.GetOutputPtr(0); + // Setting this still doesn't initialize to identity - + // output buffer must be initialized separately. 
+ auto sycl_reducer = sycl::reduction( + output, identity, red_op, + {sycl::property::reduction::initialize_to_identity()}); + cgh.parallel_for( + sycl::nd_range<1>{num_work_items, work_group_size}, + sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) { + auto glob_id = item.get_global_id(0); + auto offset = ((glob_id >> log2workitems_per_group) + << log2elements_per_group) + + (glob_id & mask); + auto item_out = identity; + for (size_t i = 0; i < elements_per_work_item; i++) { + size_t idx = + (i << log2workitems_per_group) + offset; + if (idx >= num_elements) break; + auto val = + *scalar_out_indexer.GetInputPtr( + 0, idx); + item_out = red_op(item_out, val); + } + red_arg.combine(item_out); + }); + }; + + auto e = queue.submit(red_cg); + } + queue.wait_and_throw(); +} + +// Based on OneAPI GPU optimization guide code sample (Blocked access to +// input data + SYCL builtin reduction ops for final reduction) +// TODO: This launches one kernel per output element, which can be inefficient +// in cases where the reduction dim is small but the non-reduced dim is large. +// Speed-up by launching one kernel for the entire reduction. +template +void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) { + auto device_props = + sy::SYCLContext::GetInstance().GetDeviceProperties(device); + auto queue = device_props.queue; + auto work_group_size = device_props.max_work_group_size; + size_t log2elements_per_group = 13; + auto elements_per_group = (1 << log2elements_per_group); // 8192 + size_t log2workitems_per_group = 8; + auto workitems_per_group = (1 << log2workitems_per_group); // 256 + auto elements_per_work_item = + elements_per_group / workitems_per_group; // 32 (= max SIMD size) + auto mask = ~(~0 << log2workitems_per_group); + ReductionOp red_op; + + // atomic flag. Must be 4 bytes. 
+ sycl::buffer output_in_use{indexer.NumOutputElements()}; + auto e_fill = queue.submit([&](auto& cgh) { + auto acc_output_in_use = + output_in_use.get_access(cgh); + cgh.fill(acc_output_in_use, 0); + }); + + for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements(); + output_idx++) { + // sub_indexer.NumWorkloads() == ipo. + // sub_indexer's workload_idx is indexer's ipo_idx. + Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx); + auto num_elements = scalar_out_indexer.NumWorkloads(); + auto num_work_groups = num_elements / elements_per_group; + if (num_elements > elements_per_group * num_work_groups) + ++num_work_groups; + // ensure each work group has work_group_size work items + auto num_work_items = num_work_groups * work_group_size; + + sycl::buffer this_output_in_use{output_in_use, output_idx, + 1}; + auto arg_red_cg = [&](auto& cgh) { + auto acc_in_use = + this_output_in_use + .get_access(cgh); + cgh.parallel_for( + sycl::nd_range<1>{num_work_items, work_group_size}, + [=](sycl::nd_item<1> item) { + auto& out_idx = + *scalar_out_indexer.GetOutputPtr(0, 0); + auto& out_val = + *scalar_out_indexer.GetOutputPtr(1, + 0); + auto glob_id = item.get_global_id(0); + auto this_group = item.get_group(); + auto offset = ((glob_id >> log2workitems_per_group) + << log2elements_per_group) + + (glob_id & mask); + int64_t it_idx = 0; + scalar_t it_val = identity; + for (size_t i = 0; i < elements_per_work_item; i++) { + size_t idx = + (i << log2workitems_per_group) + offset; + if (idx >= num_elements) break; + auto val = + *scalar_out_indexer.GetInputPtr( + 0, idx); + std::tie(it_idx, it_val) = + red_op(it_idx, it_val, idx, val); + } + auto group_out_val = sycl::reduce_over_group( + this_group, it_val, identity, + typename ReductionOp::basic_reduction()); + // atomic (serial) reduction over all groups. SYCL does + // not have a barrier over groups. Work item(s) with min + // / max value update the output. 
(non-deterministic) + if (it_val == group_out_val) { + // TODO: Look for a better option to a spinlock + // mutex. + auto in_use = sycl::atomic_ref< + int32_t, sycl::memory_order::acq_rel, + sycl::memory_scope::device>(acc_in_use[0]); + while (in_use.exchange(1) == 1) { + } + std::tie(out_idx, out_val) = red_op( + out_idx, out_val, it_idx, group_out_val); + in_use.store(0); + } + }); + }; + + auto e = queue.submit(arg_red_cg); + } + queue.wait_and_throw(); +} +} // namespace + +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code) { + Device device = src.GetDevice(); + if (s_regular_reduce_ops.find(op_code) != s_regular_reduce_ops.end()) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + scalar_t identity; + switch (op_code) { + case ReductionOpCode::Sum: + dst.Fill(0); + SYCLReductionEngine, scalar_t>( + device, indexer, 0); + break; + case ReductionOpCode::Prod: + dst.Fill(1); + SYCLReductionEngine, scalar_t>( + device, indexer, 1); + break; + case ReductionOpCode::Min: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Min."); + } else { + identity = std::numeric_limits::max(); + dst.Fill(identity); + SYCLReductionEngine, scalar_t>( + device, indexer, identity); + } + break; + case ReductionOpCode::Max: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Max."); + } else { + identity = std::numeric_limits::lowest(); + dst.Fill(identity); + SYCLReductionEngine, scalar_t>( + device, indexer, identity); + } + break; + default: + utility::LogError("Unsupported op code."); + break; + } + }); + } else if (s_arg_reduce_ops.find(op_code) != s_arg_reduce_ops.end()) { + if (dst.GetDtype() != core::Int64) { + utility::LogError("Arg-reduction must have int64 output dtype."); + } + // Accumulation buffer to store temporary min/max values. 
+ Tensor dst_acc(dst.GetShape(), src.GetDtype(), src.GetDevice()); + Indexer indexer({src}, {dst, dst_acc}, DtypePolicy::INPUT_SAME, dims); + DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + scalar_t identity; + switch (op_code) { + case ReductionOpCode::ArgMin: + identity = std::numeric_limits::max(); + dst_acc.Fill(identity); + SYCLArgReductionEngine, scalar_t>( + device, indexer, identity); + break; + case ReductionOpCode::ArgMax: + identity = std::numeric_limits::lowest(); + dst_acc.Fill(identity); + SYCLArgReductionEngine, scalar_t>( + device, indexer, identity); + break; + default: + utility::LogError("Unsupported op code."); + break; + } + }); + } else if (s_boolean_reduce_ops.find(op_code) != + s_boolean_reduce_ops.end()) { + if (src.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean input tensor."); + } + if (dst.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean output tensor."); + } + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + switch (op_code) { + case ReductionOpCode::All: + // Identity == true. 0-sized tensor, returns true. + dst.Fill(true); + SYCLReductionEngine, bool>( + device, indexer, true); + break; + case ReductionOpCode::Any: + // Identity == false. 0-sized tensor, returns false. 
+ dst.Fill(false); + SYCLReductionEngine, bool>( + device, indexer, false); + break; + default: + utility::LogError("Unsupported op code."); + break; + } + } else { + utility::LogError("Unsupported op code."); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/UnaryEW.cpp b/cpp/open3d/core/kernel/UnaryEW.cpp index 911b2885e34..2513d41205d 100644 --- a/cpp/open3d/core/kernel/UnaryEW.cpp +++ b/cpp/open3d/core/kernel/UnaryEW.cpp @@ -22,6 +22,19 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { src.GetShape(), dst.GetShape()); } + // Check dtype compatibility + const auto float_only_ops = {UnaryEWOpCode::Sqrt, UnaryEWOpCode::Sin, + UnaryEWOpCode::Cos, UnaryEWOpCode::Exp, + UnaryEWOpCode::IsNan, UnaryEWOpCode::IsInf, + UnaryEWOpCode::IsFinite}; + Dtype src_dtype = src.GetDtype(); + if (std::find(float_only_ops.begin(), float_only_ops.end(), op_code) != + float_only_ops.end() && + src_dtype != core::Float32 && src_dtype != core::Float64) { + utility::LogError("Only supports Float32 and Float64, but {} is used.", + src_dtype.ToString()); + } + // Dispatch to device Device src_device = src.GetDevice(); Device dst_device = dst.GetDevice(); @@ -32,6 +45,12 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { if (src_device.IsCPU()) { UnaryEWCPU(src, dst, op_code); + } else if (src_device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + UnaryEWSYCL(src, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src_device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE UnaryEWCUDA(src, dst, op_code); @@ -73,6 +92,8 @@ void Copy(const Tensor& src, Tensor& dst) { #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif + } else { + utility::LogError("Copy: SYCL <-> CUDA Unimplemented device"); } } diff --git a/cpp/open3d/core/kernel/UnaryEW.h b/cpp/open3d/core/kernel/UnaryEW.h index 907b3371167..131dff08b91 100644 
--- a/cpp/open3d/core/kernel/UnaryEW.h +++ b/cpp/open3d/core/kernel/UnaryEW.h @@ -32,15 +32,18 @@ enum class UnaryEWOpCode { }; void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); - void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void UnaryEWCUDA(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); #endif -// Copy is separated from other unary ops since it support cross-device copy and -// dtype casting. +// Copy is separated from other unary ops since it supports cross-device copy +// and dtype casting. void Copy(const Tensor& src, Tensor& dst); void CopyCPU(const Tensor& src, Tensor& dst); diff --git a/cpp/open3d/core/kernel/UnaryEWCPU.cpp b/cpp/open3d/core/kernel/UnaryEWCPU.cpp index 1a502e42a1d..f4673200b24 100644 --- a/cpp/open3d/core/kernel/UnaryEWCPU.cpp +++ b/cpp/open3d/core/kernel/UnaryEWCPU.cpp @@ -218,14 +218,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - auto assert_dtype_is_float = [](Dtype dtype) -> void { - if (dtype != core::Float32 && dtype != core::Float64) { - utility::LogError( - "Only supports Float32 and Float64, but {} is used.", - dtype.ToString()); - } - }; - if (op_code == UnaryEWOpCode::LogicalNot) { if (dst_dtype == src_dtype) { Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); @@ -259,7 +251,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { } else if (op_code == UnaryEWOpCode::IsNan || op_code == UnaryEWOpCode::IsInf || op_code == UnaryEWOpCode::IsFinite) { - assert_dtype_is_float(src_dtype); Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); #ifdef BUILD_ISPC_MODULE ispc::Indexer ispc_indexer = indexer.ToISPC(); @@ -291,7 +282,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { 
DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { switch (op_code) { case UnaryEWOpCode::Sqrt: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSqrtElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -299,7 +289,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Sin: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSinElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -307,7 +296,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Cos: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUCosElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -322,7 +310,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Exp: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUExpElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, diff --git a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp index fed05af7b47..af99df1c91e 100644 --- a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp +++ b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp @@ -8,8 +8,12 @@ #include #include +#include "open3d/core/Dispatch.h" #include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelFor.h" +#include "open3d/core/ParallelForSYCL.h" #include "open3d/core/SizeVector.h" #include "open3d/core/Tensor.h" #include "open3d/core/kernel/UnaryEW.h" @@ -19,32 +23,265 @@ namespace open3d { namespace core { namespace kernel { +namespace { + +struct UnaryElementKernel { + UnaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + void operator()(int64_t i) {} + +protected: + Indexer indexer; +}; + +template +struct CopyElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + 
const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(*src); + } +}; + +// Math: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + src_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(elem_op(static_cast(*src))); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + float* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(Sqrt, sycl::sqrt); +UNARY_ELEMENT_KERNEL(Sin, sycl::sin); +UNARY_ELEMENT_KERNEL(Cos, sycl::cos); +UNARY_ELEMENT_KERNEL(Exp, sycl::exp); +// TODO: Use sycl::abs for integers (no casting) +UNARY_ELEMENT_KERNEL(Abs, sycl::fabs); +UNARY_ELEMENT_KERNEL(Floor, sycl::floor); +UNARY_ELEMENT_KERNEL(Ceil, sycl::ceil); +UNARY_ELEMENT_KERNEL(Round, sycl::round); +UNARY_ELEMENT_KERNEL(Trunc, sycl::trunc); +#undef UNARY_ELEMENT_KERNEL + +// No special treatment for unsigned types - we use the SYCL runtime +// default +template +struct NegElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const scalar_t* src = indexer.GetInputPtr(0, i); + scalar_t* dst = indexer.GetOutputPtr(i); + *dst = -*src; + } +}; + +// Float checkers: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + bool* dst 
= indexer.GetOutputPtr(i); \ + *dst = elem_op(static_cast(*src)); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + bool* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(IsNan, sycl::isnan); +UNARY_ELEMENT_KERNEL(IsInf, sycl::isinf); +UNARY_ELEMENT_KERNEL(IsFinite, sycl::isfinite); +#undef UNARY_ELEMENT_KERNEL + +template +struct LogicalNotElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(!static_cast(*src)); + } +}; +} // namespace + void CopySYCL(const Tensor& src, Tensor& dst) { - // It has been checked that - // - at least one of src or dst is SYCL device + // src and dst have been checked to have the same shape + // at least one of src and dst is SYCL and the other is SYCL or CPU SizeVector shape = src.GetShape(); + Dtype src_dtype = src.GetDtype(), dst_dtype = dst.GetDtype(); + Device src_device = src.GetDevice(), dst_device = dst.GetDevice(); + Device device_with_queue = dst.IsSYCL() ? 
dst.GetDevice() : src.GetDevice(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue); + + if (src_device.IsSYCL() && dst_device.IsSYCL()) { + if (src.IsContiguous() && dst.IsContiguous() && + src.GetShape() == dst.GetShape() && src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst.GetDevice(), + src.GetDataPtr(), src.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else if (dst.NumElements() > 1 && dst.IsContiguous() && + src.NumElements() == 1 && !src_dtype.IsObject()) { + int64_t num_elements = dst.NumElements(); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + scalar_t scalar_element = src.To(dst_dtype).Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + queue.fill(dst_ptr, scalar_element, num_elements) + .wait_and_throw(); + }); + } else if (src_device == dst_device) { // non-contiguous or broadcast + // on same SYCL device + Indexer indexer({src}, dst, DtypePolicy::NONE); + if (src.GetDtype().IsObject()) { + // TODO: This is likely very slow. Coalesce into less memcpy + // calls. 
+ int64_t object_byte_size = src.GetDtype().ByteSize(); + for (int64_t i = 0; i < indexer.NumWorkloads(); ++i) { + const void* src_ptr = indexer.GetInputPtr(0, i); + void* dst_ptr = indexer.GetOutputPtr(i); + queue.memcpy(dst_ptr, src_ptr, object_byte_size); + } + queue.wait_and_throw(); + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + using src_t = scalar_t; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + using dst_t = scalar_t; + ParallelForSYCL>( + device_with_queue, indexer); + }); + }); + } + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else if (src_device.IsCPU() && dst_device.IsSYCL() || + src_device.IsSYCL() && dst_device.IsCPU()) { + Tensor src_conti = src.Contiguous(); // No op if already contiguous + if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && + src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, + src_conti.GetDataPtr(), src_conti.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else { + utility::LogError("Wrong device type {} -> {}", src_device.ToString(), + dst_device.ToString()); + } +} + +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { + // src and dst have been changed to have the same shape, device Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - Device dst_device = dst.GetDevice(); - Device src_device = src.GetDevice(); + Device device = src.GetDevice(); // == dst.GetDevice() - if (src_dtype != dst_dtype) { - utility::LogError( - "CopySYCL: Dtype conversion from src to dst not implemented!"); - } - if ((dst_device.IsSYCL() && !dst.IsContiguous()) || - (src_device.IsSYCL() && !src.IsContiguous())) { - utility::LogError( - "CopySYCL: NonContiguous SYCL tensor Copy not implemented!"); - } - Tensor src_conti = src.Contiguous(); // No op if already contiguous - if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && - 
src_dtype == dst_dtype) { - MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, - src_conti.GetDataPtr(), src_conti.GetDevice(), - src_dtype.ByteSize() * shape.NumElements()); + if (op_code == UnaryEWOpCode::LogicalNot) { + if (dst_dtype == src_dtype) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else if (dst_dtype == Bool) { + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the input."); + } + } else if (op_code == UnaryEWOpCode::IsNan || + op_code == UnaryEWOpCode::IsInf || + op_code == UnaryEWOpCode::IsFinite) { + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + if (op_code == UnaryEWOpCode::IsNan) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsInf) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsFinite) { + ParallelForSYCL>(device, + indexer); + } + }); } else { - dst.CopyFrom(src_conti.To(dst_device)); + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case UnaryEWOpCode::Sqrt: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Sin: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Cos: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Neg: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Exp: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Abs: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Floor: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Ceil: + ParallelForSYCL>(device, + 
indexer); + break; + case UnaryEWOpCode::Round: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Trunc: + ParallelForSYCL>(device, + indexer); + break; + default: + utility::LogError("Unimplemented op_code for UnaryEWSYCL"); + break; + } + }); } } diff --git a/cpp/open3d/core/linalg/AddMM.cpp b/cpp/open3d/core/linalg/AddMM.cpp index aea908cecbb..45cfd71df7d 100644 --- a/cpp/open3d/core/linalg/AddMM.cpp +++ b/cpp/open3d/core/linalg/AddMM.cpp @@ -98,6 +98,13 @@ void AddMM(const Tensor& A, ldb, lda, ldc, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + AddMMSYCL(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, + ldb, lda, ldc, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { AddMMCPU(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, diff --git a/cpp/open3d/core/linalg/AddMM.h b/cpp/open3d/core/linalg/AddMM.h index 1754e430fef..6d26703ae1b 100644 --- a/cpp/open3d/core/linalg/AddMM.h +++ b/cpp/open3d/core/linalg/AddMM.h @@ -20,6 +20,24 @@ namespace core { void AddMM( const Tensor& A, const Tensor& B, Tensor& C, double alpha, double beta); +#ifdef BUILD_SYCL_MODULE +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void AddMMCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/AddMMSYCL.cpp b/cpp/open3d/core/linalg/AddMMSYCL.cpp new file mode 100644 index 00000000000..591a4f26f00 --- /dev/null +++ b/cpp/open3d/core/linalg/AddMMSYCL.cpp @@ -0,0 +1,49 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 
www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + blas::column_major::gemm(queue, gemmTrA ? transpose::T : transpose::N, + gemmTrB ? transpose::T : transpose::N, m, n, k, + static_cast(alpha), + static_cast(A_data), lda, + static_cast(B_data), ldb, + static_cast(beta), + static_cast(C_data), ldc) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Inverse.cpp b/cpp/open3d/core/linalg/Inverse.cpp index 51cf7693217..07cb507c1ca 100644 --- a/cpp/open3d/core/linalg/Inverse.cpp +++ b/cpp/open3d/core/linalg/Inverse.cpp @@ -55,6 +55,20 @@ void Inverse(const Tensor &A, Tensor &output) { output = output.T(); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + // LAPACKE supports getri, A is in-place modified as output. 
+ Tensor A_T = A.T().To(device, /*copy=*/true); + void *A_data = A_T.GetDataPtr(); + + InverseSYCL(A_data, ipiv_data, nullptr, n, dtype, device); + output = A_T.T(); +#else + utility::LogError("Unimplemented device."); #endif } else { Dtype ipiv_dtype; diff --git a/cpp/open3d/core/linalg/Inverse.h b/cpp/open3d/core/linalg/Inverse.h index 8809f8f176c..2d3cbb2bbcf 100644 --- a/cpp/open3d/core/linalg/Inverse.h +++ b/cpp/open3d/core/linalg/Inverse.h @@ -17,11 +17,20 @@ void Inverse(const Tensor& A, Tensor& output); void InverseCPU(void* A_data, void* ipiv_data, - void* output_data, + [[maybe_unused]] void* output_data, int64_t n, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void InverseSYCL(void* A_data, + void* ipiv_data, + [[maybe_unused]] void* output_data, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void InverseCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/InverseSYCL.cpp b/cpp/open3d/core/linalg/InverseSYCL.cpp new file mode 100644 index 00000000000..f66dd089cda --- /dev/null +++ b/cpp/open3d/core/linalg/InverseSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/Inverse.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void InverseSYCL(void* A_data, + void* ipiv_data, + void* output_data, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = n; + 
DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = std::max( + lapack::getrf_scratchpad_size(queue, n, n, lda), + lapack::getri_scratchpad_size(queue, n, lda)); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + auto lu_done = + lapack::getrf(queue, n, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size); + lapack::getri(queue, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size, {lu_done}) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LU.cpp b/cpp/open3d/core/linalg/LU.cpp index 5bfafdac425..3a9a4e4788c 100644 --- a/cpp/open3d/core/linalg/LU.cpp +++ b/cpp/open3d/core/linalg/LU.cpp @@ -16,9 +16,9 @@ namespace open3d { namespace core { // Get column permutation tensor from ipiv (swapping index array). -static core::Tensor GetColPermutation(const Tensor& ipiv, - int number_of_indices, - int number_of_rows) { +static Tensor GetColPermutation(const Tensor& ipiv, + int number_of_indices, + int number_of_rows) { Tensor full_ipiv = Tensor::Arange(0, number_of_rows, 1, core::Int32, Device("CPU:0")); Tensor ipiv_cpu = ipiv.To(Device("CPU:0"), core::Int32, /*copy=*/false); @@ -42,14 +42,14 @@ static void OutputToPLU(const Tensor& output, const Tensor& ipiv, const bool permute_l) { int n = output.GetShape()[0]; - core::Device device = output.GetDevice(); + Device device = output.GetDevice(); // Get upper and lower matrix from output matrix. Triul(output, upper, lower, 0); // Get column permutation vector from pivot indices vector. Tensor col_permutation = GetColPermutation(ipiv, ipiv.GetShape()[0], n); // Creating "Permutation Matrix (P in P.A = L.U)". 
- permutation = core::Tensor::Eye(n, output.GetDtype(), device) + permutation = Tensor::Eye(n, output.GetDtype(), device) .IndexGet({col_permutation}); // Calculating P in A = P.L.U. [P.Inverse() = P.T()]. permutation = permutation.T().Contiguous(); @@ -88,15 +88,23 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { // elements as U, (diagonal elements of L are unity), and ipiv array, // which has the pivot indices (for 1 <= i <= min(M,N), row i of the // matrix was interchanged with row IPIV(i). + int64_t ipiv_len = std::min(rows, cols); if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, core::Int32, device); + ipiv = Tensor::Empty({ipiv_len}, core::Int32, device); void* ipiv_data = ipiv.GetDataPtr(); LUCUDA(A_data, ipiv_data, rows, cols, dtype, device); #else utility::LogInfo("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ipiv = Tensor::Empty({ipiv_len}, core::Int64, device); + void* ipiv_data = ipiv.GetDataPtr(); + LUSYCL(A_data, ipiv_data, rows, cols, dtype, device); +#else + utility::LogInfo("Unimplemented device."); #endif } else { Dtype ipiv_dtype; @@ -107,9 +115,7 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { } else { utility::LogError("Unsupported OPEN3D_CPU_LINALG_INT type."); } - - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, ipiv_dtype, device); + ipiv = Tensor::Empty({ipiv_len}, ipiv_dtype, device); void* ipiv_data = ipiv.GetDataPtr(); LUCPU(A_data, ipiv_data, rows, cols, dtype, device); } @@ -125,7 +131,7 @@ void LU(const Tensor& A, AssertTensorDtypes(A, {Float32, Float64}); // Get output matrix and ipiv. - core::Tensor ipiv, output; + Tensor ipiv, output; LUIpiv(A, ipiv, output); // Decompose output in P, L, U matrix form. 
diff --git a/cpp/open3d/core/linalg/LUImpl.h b/cpp/open3d/core/linalg/LUImpl.h index 36898fa3de9..fde8f9df385 100644 --- a/cpp/open3d/core/linalg/LUImpl.h +++ b/cpp/open3d/core/linalg/LUImpl.h @@ -21,6 +21,15 @@ void LUCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t rows, + int64_t cols, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void LUCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/LUSYCL.cpp b/cpp/open3d/core/linalg/LUSYCL.cpp new file mode 100644 index 00000000000..9368270c685 --- /dev/null +++ b/cpp/open3d/core/linalg/LUSYCL.cpp @@ -0,0 +1,42 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LUImpl.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = m; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. 
+ int64_t scratchpad_size = + lapack::getrf_scratchpad_size(queue, m, n, lda); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::getrf(queue, m, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LeastSquares.cpp b/cpp/open3d/core/linalg/LeastSquares.cpp index 6340331c5ce..46c520215e1 100644 --- a/cpp/open3d/core/linalg/LeastSquares.cpp +++ b/cpp/open3d/core/linalg/LeastSquares.cpp @@ -63,6 +63,12 @@ void LeastSquares(const Tensor &A, const Tensor &B, Tensor &X) { LeastSquaresCUDA(A_data, B_data, m, n, k, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + LeastSquaresSYCL(A_data, B_data, m, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { LeastSquaresCPU(A_data, B_data, m, n, k, dtype, device); diff --git a/cpp/open3d/core/linalg/LeastSquares.h b/cpp/open3d/core/linalg/LeastSquares.h index c2e79935f5d..aedc33a5a39 100644 --- a/cpp/open3d/core/linalg/LeastSquares.h +++ b/cpp/open3d/core/linalg/LeastSquares.h @@ -25,6 +25,16 @@ void LeastSquaresCUDA(void* A_data, const Device& device); #endif +#ifdef BUILD_SYCL_MODULE +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + void LeastSquaresCPU(void* A_data, void* B_data, int64_t m, diff --git a/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp new file mode 100644 index 00000000000..faa9dfb3307 --- /dev/null +++ b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp @@ -0,0 +1,46 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// 
Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LeastSquares.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int nrhs = k, lda = m, stride_a = lda * n, ldb = std::max(m, n), + stride_b = ldb * nrhs, batch_size = 1; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = lapack::gels_batch_scratchpad_size( + queue, transpose::N, m, n, nrhs, lda, stride_a, ldb, stride_b, + batch_size); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gels_batch( + queue, transpose::N, m, n, nrhs, static_cast(A_data), + lda, stride_a, static_cast(B_data), ldb, stride_b, + batch_size, static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Matmul.cpp b/cpp/open3d/core/linalg/Matmul.cpp index 1b616c5f335..012a116b736 100644 --- a/cpp/open3d/core/linalg/Matmul.cpp +++ b/cpp/open3d/core/linalg/Matmul.cpp @@ -65,7 +65,13 @@ void Matmul(const Tensor& A, const Tensor& B, Tensor& output) { output = Tensor::Empty({m, n}, dtype, device); void* C_data = output.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + MatmulSYCL(B_data, A_data, C_data, n, k, m, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice 
scoped_device(device); MatmulCUDA(B_data, A_data, C_data, n, k, m, dtype, device); diff --git a/cpp/open3d/core/linalg/Matmul.h b/cpp/open3d/core/linalg/Matmul.h index da29240b0c4..eeaba408591 100644 --- a/cpp/open3d/core/linalg/Matmul.h +++ b/cpp/open3d/core/linalg/Matmul.h @@ -15,6 +15,16 @@ namespace core { /// Computes matrix multiplication C = AB. void Matmul(const Tensor& A, const Tensor& B, Tensor& C); +#ifdef BUILD_SYCL_MODULE +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device); +#endif #ifdef BUILD_CUDA_MODULE void MatmulCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/MatmulSYCL.cpp b/cpp/open3d/core/linalg/MatmulSYCL.cpp new file mode 100644 index 00000000000..be2d6f94ac7 --- /dev/null +++ b/cpp/open3d/core/linalg/MatmulSYCL.cpp @@ -0,0 +1,39 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t alpha = 1, beta = 0; + blas::column_major::gemm(queue, transpose::N, transpose::N, m, n, k, + alpha, static_cast(A_data), m, + static_cast(B_data), k, beta, + static_cast(C_data), m) + .wait_and_throw(); + }); +} + +} 
// namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/SVD.cpp b/cpp/open3d/core/linalg/SVD.cpp index 935d014ef4e..657e79f9c57 100644 --- a/cpp/open3d/core/linalg/SVD.cpp +++ b/cpp/open3d/core/linalg/SVD.cpp @@ -40,23 +40,31 @@ void SVD(const Tensor &A, Tensor &U, Tensor &S, Tensor &VT) { U = Tensor::Empty({m, m}, dtype, device); S = Tensor::Empty({n}, dtype, device); VT = Tensor::Empty({n, n}, dtype, device); - Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); void *A_data = A_T.GetDataPtr(); void *U_data = U.GetDataPtr(); void *S_data = S.GetDataPtr(); void *VT_data = VT.GetDataPtr(); - void *superb_data = superb.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + SVDSYCL(A_data, U_data, S_data, VT_data, m, n, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCUDA(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); #else utility::LogError("Unimplemented device."); #endif } else { + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCPU(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); } diff --git a/cpp/open3d/core/linalg/SVD.h b/cpp/open3d/core/linalg/SVD.h index 9e6deefa6c1..51d775d7a9d 100644 --- a/cpp/open3d/core/linalg/SVD.h +++ b/cpp/open3d/core/linalg/SVD.h @@ -16,6 +16,17 @@ namespace core { /// is a min(m, n), VT is an n x n tensor. 
void SVD(const Tensor& A, Tensor& U, Tensor& S, Tensor& VT); +#ifdef BUILD_SYCL_MODULE +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SVDCUDA(const void* A_data, void* U_data, diff --git a/cpp/open3d/core/linalg/SVDSYCL.cpp b/cpp/open3d/core/linalg/SVDSYCL.cpp new file mode 100644 index 00000000000..27abf31728f --- /dev/null +++ b/cpp/open3d/core/linalg/SVDSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/SVD.h" + +namespace open3d { +namespace core { + +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t lda = m, ldvt = n, ldu = m; + int64_t scratchpad_size = lapack::gesvd_scratchpad_size( + queue, jobsvd::vectors, jobsvd::vectors, m, n, lda, ldu, ldvt); + // Use blob to ensure cleanup of scratchpad memory. 
+ Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesvd( + queue, jobsvd::vectors, jobsvd::vectors, m, n, + const_cast(static_cast(A_data)), + lda, static_cast(S_data), + static_cast(U_data), ldu, + static_cast(VT_data), ldvt, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Solve.cpp b/cpp/open3d/core/linalg/Solve.cpp index f1025520fac..ec09e8be3f1 100644 --- a/cpp/open3d/core/linalg/Solve.cpp +++ b/cpp/open3d/core/linalg/Solve.cpp @@ -60,7 +60,16 @@ void Solve(const Tensor &A, const Tensor &B, Tensor &X) { X = B.T().Clone(); void *B_data = X.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + SolveSYCL(A_data, B_data, ipiv_data, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); Tensor ipiv = Tensor::Empty({n}, core::Int32, device); diff --git a/cpp/open3d/core/linalg/Solve.h b/cpp/open3d/core/linalg/Solve.h index 485de7ef0f2..a299c100ca7 100644 --- a/cpp/open3d/core/linalg/Solve.h +++ b/cpp/open3d/core/linalg/Solve.h @@ -23,6 +23,16 @@ void SolveCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SolveCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/SolveSYCL.cpp b/cpp/open3d/core/linalg/SolveSYCL.cpp new file mode 100644 index 00000000000..38d6690ff89 --- /dev/null +++ b/cpp/open3d/core/linalg/SolveSYCL.cpp @@ -0,0 +1,44 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// 
---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/Solve.h" + +namespace open3d { +namespace core { + +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t nrhs = k, lda = n, ldb = n; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t scratchpad_size = lapack::gesv_scratchpad_size( + queue, n, nrhs, lda, ldb); + // Use blob to ensure cleanup of scratchpad memory. + Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesv(queue, n, nrhs, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(B_data), ldb, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Tri.cpp b/cpp/open3d/core/linalg/Tri.cpp index 77e72b076a2..b15b864aa6a 100644 --- a/cpp/open3d/core/linalg/Tri.cpp +++ b/cpp/open3d/core/linalg/Tri.cpp @@ -42,6 +42,12 @@ void Triu(const Tensor& A, Tensor& output, const int diagonal) { TriuCUDA(A.Contiguous(), output, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriuSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriuCPU(A.Contiguous(), output, diagonal); @@ -58,6 +64,12 @@ void Tril(const Tensor& A, Tensor& output, const int diagonal) { TrilCUDA(A.Contiguous(), output, diagonal); #else 
utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TrilSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TrilCPU(A.Contiguous(), output, diagonal); @@ -75,6 +87,12 @@ void Triul(const Tensor& A, Tensor& upper, Tensor& lower, const int diagonal) { TriulCUDA(A.Contiguous(), upper, lower, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriulSYCL(A.Contiguous(), upper, lower, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriulCPU(A.Contiguous(), upper, lower, diagonal); diff --git a/cpp/open3d/core/linalg/TriImpl.h b/cpp/open3d/core/linalg/TriImpl.h index 441d9c69c93..db7b204a87c 100644 --- a/cpp/open3d/core/linalg/TriImpl.h +++ b/cpp/open3d/core/linalg/TriImpl.h @@ -22,6 +22,17 @@ void TriulCPU(const Tensor& A, Tensor& lower, const int diagonal = 0); +#ifdef BUILD_SYCL_MODULE +void TriuSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TrilSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TriulSYCL(const Tensor& A, + Tensor& upper, + Tensor& lower, + const int diagonal = 0); +#endif + #ifdef BUILD_CUDA_MODULE void TriuCUDA(const Tensor& A, Tensor& output, const int diagonal = 0); diff --git a/cpp/open3d/core/linalg/TriSYCL.cpp b/cpp/open3d/core/linalg/TriSYCL.cpp new file mode 100644 index 00000000000..3d10f99efc3 --- /dev/null +++ b/cpp/open3d/core/linalg/TriSYCL.cpp @@ -0,0 +1,82 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include 
"open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/linalg/TriImpl.h" + +namespace open3d { +namespace core { + +void TriuSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) >= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TrilSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) <= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TriulSYCL(const Tensor &A, + Tensor &upper, + Tensor &lower, + const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *upper_ptr = static_cast(upper.GetDataPtr()); + scalar_t *lower_ptr = static_cast(lower.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + 
wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) < + diagonal) { + lower_ptr[wid_1d] = A_ptr[wid_1d]; + } else if (static_cast(wid[0]) - + static_cast(wid[1]) > + diagonal) { + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } else { + lower_ptr[wid_1d] = 1; + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.h b/cpp/open3d/core/nns/NearestNeighborSearch.h index 98898cdd7b2..2996f8a6caa 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.h +++ b/cpp/open3d/core/nns/NearestNeighborSearch.h @@ -27,12 +27,14 @@ class NearestNeighborSearch { /// Constructor. /// /// \param dataset_points Dataset points for constructing search index. Must - /// be 2D, with shape {n, d}. + /// be 2D, with shape {n, d}. SYCL tensors are not yet supported. // NearestNeighborSearch(const Tensor &dataset_points) // : dataset_points_(dataset_points){}; NearestNeighborSearch(const Tensor &dataset_points, const Dtype &index_dtype = core::Int32) - : dataset_points_(dataset_points), index_dtype_(index_dtype){}; + : dataset_points_(dataset_points), index_dtype_(index_dtype) { + AssertNotSYCL(dataset_points_); + }; ~NearestNeighborSearch(); NearestNeighborSearch(const NearestNeighborSearch &) = delete; NearestNeighborSearch &operator=(const NearestNeighborSearch &) = delete; diff --git a/cpp/open3d/t/geometry/TriangleMesh.cpp b/cpp/open3d/t/geometry/TriangleMesh.cpp index 17461b7214c..938f7333d49 100644 --- a/cpp/open3d/t/geometry/TriangleMesh.cpp +++ b/cpp/open3d/t/geometry/TriangleMesh.cpp @@ -1447,6 +1447,11 @@ Image TriangleMesh::ProjectImagesToAlbedo( const std::vector &extrinsic_matrices, int tex_size /*=1024*/, bool update_material /*=true*/) { + if (!GetDevice().IsCPU() || !Image::HAVE_IPP) { + utility::LogError( + "ProjectImagesToAlbedo is only supported on x86_64 CPU " + "devices."); + } using core::None; using tk = core::TensorKey; constexpr float EPS = 1e-6; @@ 
-1480,7 +1485,7 @@ Image TriangleMesh::ProjectImagesToAlbedo( tex_size, {"positions"}, 1, 0, false)["positions"]; core::Tensor albedo = core::Tensor::Zeros({tex_size, tex_size, 4}, core::Float32); - albedo.Slice(2, 3, 4).Fill(EPS); // regularize + albedo.Slice(2, 3, 4).Fill(EPS); // regularize weight std::mutex albedo_mutex; RaycastingScene rcs; @@ -1581,11 +1586,11 @@ Image TriangleMesh::ProjectImagesToAlbedo( // C. Interpolate weighted image to weighted texture // albedo[u,v] = image[ i[u,v], j[u,v] ] this_albedo[widx].Fill(0.f); - ipp::Remap(weighted_image[widx], /*{height, width, 4} f32*/ - uv2xy2[0], /* {texsz, texsz} f32*/ - uv2xy2[1], /* {texsz, texsz} f32*/ - this_albedo[widx], /*{texsz, texsz, 4} f32*/ - t::geometry::Image::InterpType::Linear); + IPP_CALL(ipp::Remap, weighted_image[widx], /*{height, width, 4} f32*/ + uv2xy2[0], /* {texsz, texsz} f32*/ + uv2xy2[1], /* {texsz, texsz} f32*/ + this_albedo[widx], /*{texsz, texsz, 4} f32*/ + t::geometry::Image::InterpType::Linear); // Weights can become negative with higher order interpolation std::unique_lock albedo_lock{albedo_mutex}; diff --git a/cpp/open3d/t/geometry/TriangleMesh.h b/cpp/open3d/t/geometry/TriangleMesh.h index b1bd980f8b6..a517a28c7c6 100644 --- a/cpp/open3d/t/geometry/TriangleMesh.h +++ b/cpp/open3d/t/geometry/TriangleMesh.h @@ -1006,6 +1006,8 @@ class TriangleMesh : public Geometry, public DrawableGeometry { /// albedo. For best results, use images captured with exposure and white /// balance lock to reduce the chance of seams in the output texture. /// + /// This function is only supported on the CPU device. + /// /// \param images vector of images. /// \param intrinsic_matrices vector of {3,3} intrinsic matrices describing /// the pinhole camera. 
diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index aa651596b6d..acc3ddc7e33 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -1,4 +1,6 @@ open3d_ispc_add_library(tgeometry_kernel OBJECT) +set_target_properties(tgeometry_kernel PROPERTIES CXX_VISIBILITY_PRESET "hidden") + target_sources(tgeometry_kernel PRIVATE Image.cpp diff --git a/cpp/open3d/t/geometry/kernel/IPPImage.cpp b/cpp/open3d/t/geometry/kernel/IPPImage.cpp index 1535673db30..5b6474757f2 100644 --- a/cpp/open3d/t/geometry/kernel/IPPImage.cpp +++ b/cpp/open3d/t/geometry/kernel/IPPImage.cpp @@ -28,7 +28,6 @@ #include "open3d/core/Dispatch.h" #include "open3d/core/Dtype.h" -#include "open3d/core/ParallelFor.h" #include "open3d/core/ShapeUtil.h" #include "open3d/core/Tensor.h" #include "open3d/t/geometry/Image.h" diff --git a/cpp/open3d/t/io/ImageIO.h b/cpp/open3d/t/io/ImageIO.h index 44a1a9d6a66..6b50af60eb5 100644 --- a/cpp/open3d/t/io/ImageIO.h +++ b/cpp/open3d/t/io/ImageIO.h @@ -47,7 +47,7 @@ constexpr int kOpen3DImageIODefaultQuality = -1; /// /// Supported file extensions are png, jpg/jpeg. Data type and number of /// channels depends on the file extension. -/// - PNG: Dtype should be one of core::UInt8, core::UInt16 +/// - PNG: Dtype should be one of core::Bool, core::UInt8, core::UInt16 /// Supported number of channels are 1, 3, and 4. /// - JPG: Dtyppe should be core::UInt8 /// Supported number of channels are 1 and 3. 
diff --git a/cpp/open3d/utility/CompilerInfo.cpp b/cpp/open3d/utility/CompilerInfo.cpp index ecd538ce92d..f55e4d666de 100644 --- a/cpp/open3d/utility/CompilerInfo.cpp +++ b/cpp/open3d/utility/CompilerInfo.cpp @@ -44,12 +44,19 @@ std::string CompilerInfo::CUDACompilerVersion() const { void CompilerInfo::Print() const { #ifdef BUILD_CUDA_MODULE - utility::LogInfo("CompilerInfo: C++ {}, {} {}, {} {}.", CXXStandard(), - CXXCompilerId(), CXXCompilerVersion(), CUDACompilerId(), - CUDACompilerVersion()); + utility::LogInfo("CompilerInfo: C++ {}, {} {}, {} {}, SYCL disabled.", + CXXStandard(), CXXCompilerId(), CXXCompilerVersion(), + CUDACompilerId(), CUDACompilerVersion()); #else - utility::LogInfo("CompilerInfo: C++ {}, {} {}, CUDA disabled.", - CXXStandard(), CXXCompilerId(), CXXCompilerVersion()); +#ifdef BUILD_SYCL_MODULE + utility::LogInfo( + "CompilerInfo: C++ {}, {} {}, CUDA disabled, SYCL enabled.", + CXXStandard(), CXXCompilerId(), CXXCompilerVersion()); +#else + utility::LogInfo( + "CompilerInfo: C++ {}, {} {}, CUDA disabled, SYCL disabled", + CXXStandard(), CXXCompilerId(), CXXCompilerVersion()); +#endif #endif } diff --git a/cpp/pybind/CMakeLists.txt b/cpp/pybind/CMakeLists.txt index 6efae9a17fd..5fdce155f18 100644 --- a/cpp/pybind/CMakeLists.txt +++ b/cpp/pybind/CMakeLists.txt @@ -233,6 +233,7 @@ add_custom_target(python-package -DOPEN3D_ML_ROOT=${OPEN3D_ML_ROOT} -DBUILD_GUI=${BUILD_GUI} -DBUILD_CUDA_MODULE=${BUILD_CUDA_MODULE} + -DBUILD_SYCL_MODULE=${BUILD_SYCL_MODULE} -DGUI_RESOURCE_DIR=${GUI_RESOURCE_DIR} -DPROJECT_EMAIL=${PROJECT_EMAIL} -DPROJECT_HOMEPAGE_URL=${PROJECT_HOMEPAGE_URL} diff --git a/cpp/pybind/core/sycl_utils.cpp b/cpp/pybind/core/sycl_utils.cpp index 9a28dcb5072..dce144e050f 100644 --- a/cpp/pybind/core/sycl_utils.cpp +++ b/cpp/pybind/core/sycl_utils.cpp @@ -33,6 +33,11 @@ void pybind_sycl_utils_definitions(py::module& m) { "Enables the JIT cache for SYCL. 
This sets an environment " "variable and " "will affect the entire process and any child processes."); + + m_sycl.def("get_device_type", sy::GetDeviceType, "device"_a, + "Returns the device type (cpu / gpu / accelerator / custom) of " + "the specified device as a string. Returns empty string if the " + "device is not available."); } } // namespace core diff --git a/cpp/pybind/make_python_package.cmake b/cpp/pybind/make_python_package.cmake index 01e0d5663f5..aa4171414b0 100644 --- a/cpp/pybind/make_python_package.cmake +++ b/cpp/pybind/make_python_package.cmake @@ -77,6 +77,7 @@ if (BUNDLE_OPEN3D_ML) file(RENAME "${PYTHON_PACKAGE_DST_DIR}/open3d/ml3d" "${PYTHON_PACKAGE_DST_DIR}/open3d/_ml3d") endif() +set(requirement_files ${PYTHON_PACKAGE_SRC_DIR}/requirements.txt) # Build Jupyter plugin. if (BUILD_JUPYTER_EXTENSION) if (WIN32 OR UNIX AND NOT LINUX_AARCH64) @@ -113,17 +114,19 @@ if (BUILD_JUPYTER_EXTENSION) "npm install -g yarn.") endif() - # Append requirements_jupyter_install.txt to requirements.txt - # These will be installed when `pip install open3d`. - execute_process(COMMAND ${CMAKE_COMMAND} -E cat - ${PYTHON_PACKAGE_SRC_DIR}/requirements.txt - ${PYTHON_PACKAGE_SRC_DIR}/requirements_jupyter_install.txt - OUTPUT_VARIABLE ALL_REQUIREMENTS - ) - # The double-quote "" is important as it keeps the semicolons. - file(WRITE ${PYTHON_PACKAGE_DST_DIR}/requirements.txt "${ALL_REQUIREMENTS}") + list(APPEND requirement_files + ${PYTHON_PACKAGE_SRC_DIR}/requirements_jupyter_install.txt) +endif() + +if (BUILD_SYCL_MODULE) + list(APPEND requirement_files ${PYTHON_PACKAGE_SRC_DIR}/requirements_sycl.txt) endif() +# These will be installed when the user does `pip install open3d`. 
+ execute_process(COMMAND ${CMAKE_COMMAND} -E cat ${requirement_files} + OUTPUT_FILE ${PYTHON_PACKAGE_DST_DIR}/requirements.txt + ) + if (BUILD_GUI) file(MAKE_DIRECTORY "${PYTHON_PACKAGE_DST_DIR}/open3d/resources/") file(COPY ${GUI_RESOURCE_DIR} diff --git a/cpp/pybind/t/geometry/trianglemesh.cpp b/cpp/pybind/t/geometry/trianglemesh.cpp index e5df69fc48a..b1fed27bc08 100644 --- a/cpp/pybind/t/geometry/trianglemesh.cpp +++ b/cpp/pybind/t/geometry/trianglemesh.cpp @@ -1072,6 +1072,8 @@ blended together in the resulting albedo. For best results, use images captured with exposure and white balance lock to reduce the chance of seams in the output texture. +This function is only supported on the CPU. + Args: images (List[open3d.t.geometry.Image]): List of images. intrinsic_matrices (List[open3d.core.Tensor]): List of (3,3) intrinsic matrices describing diff --git a/cpp/tests/core/Blob.cpp b/cpp/tests/core/Blob.cpp index 85be95d31cc..c55d537acb0 100644 --- a/cpp/tests/core/Blob.cpp +++ b/cpp/tests/core/Blob.cpp @@ -16,10 +16,9 @@ namespace open3d { namespace tests { class BlobPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P( - Blob, - BlobPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P(Blob, + BlobPermuteDevices, + testing::ValuesIn(BlobPermuteDevices::TestCases())); TEST_P(BlobPermuteDevices, BlobConstructor) { core::Device device = GetParam(); diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index c2839eacbae..a1c4826d0fa 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -36,3 +36,12 @@ if (BUILD_ISPC_MODULE) ParallelFor.ispc ) endif() + +# TODO: cmake does not currently build this test! 
+# if (BUILD_SYCL_MODULE) +# target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES +# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") +# endif() \ No newline at end of file diff --git a/cpp/tests/core/CoreTest.cpp b/cpp/tests/core/CoreTest.cpp index 8f1486ce4c6..850d486656e 100644 --- a/cpp/tests/core/CoreTest.cpp +++ b/cpp/tests/core/CoreTest.cpp @@ -16,6 +16,13 @@ #include "open3d/core/SizeVector.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os) { + *os << device.ToString(); +} +void PrintTo(const Dtype &dtype, std::ostream *os) { *os << dtype.ToString(); } +} // namespace core + namespace tests { std::vector PermuteDtypesWithBool::TestCases() { @@ -45,7 +52,6 @@ std::vector PermuteDevices::TestCases() { devices.push_back(cuda_devices[0]); devices.push_back(cuda_devices[1]); } - return devices; } @@ -53,8 +59,12 @@ std::vector PermuteDevicesWithSYCL::TestCases() { std::vector devices = PermuteDevices::TestCases(); std::vector sycl_devices = core::Device::GetAvailableSYCLDevices(); - if (!sycl_devices.empty()) { + // Skip the last SYCL device - this is the CPU fallback and support is + // untested. 
+ if (sycl_devices.size() > 1) { devices.push_back(sycl_devices[0]); + // devices.insert(devices.end(), sycl_devices.begin(), + // sycl_devices.end()); } return devices; } @@ -85,7 +95,6 @@ PermuteDevicePairs::TestCases() { } } } - return device_pairs; } @@ -123,7 +132,6 @@ PermuteDevicePairsWithSYCL::TestCases() { } } } - return device_pairs; } diff --git a/cpp/tests/core/CoreTest.h b/cpp/tests/core/CoreTest.h index ab874b8b485..f8c6f5db273 100644 --- a/cpp/tests/core/CoreTest.h +++ b/cpp/tests/core/CoreTest.h @@ -15,6 +15,10 @@ #include "tests/Tests.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os); +void PrintTo(const Dtype &dtype, std::ostream *os); +} // namespace core namespace tests { class PermuteDtypesWithBool : public testing::TestWithParam { diff --git a/cpp/tests/core/EigenConverter.cpp b/cpp/tests/core/EigenConverter.cpp index 7754657cc13..b3c7d085908 100644 --- a/cpp/tests/core/EigenConverter.cpp +++ b/cpp/tests/core/EigenConverter.cpp @@ -17,10 +17,11 @@ namespace open3d { namespace tests { -class EigenConverterPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(EigenConverter, - EigenConverterPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class EigenConverterPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + EigenConverter, + EigenConverterPermuteDevices, + testing::ValuesIn(EigenConverterPermuteDevices::TestCases())); TEST_P(EigenConverterPermuteDevices, TensorToEigenMatrix) { core::Device device = GetParam(); diff --git a/cpp/tests/core/Indexer.cpp b/cpp/tests/core/Indexer.cpp index 305c08693a0..50a62fee618 100644 --- a/cpp/tests/core/Indexer.cpp +++ b/cpp/tests/core/Indexer.cpp @@ -22,10 +22,10 @@ namespace open3d { namespace tests { -class IndexerPermuteDevices : public PermuteDevices {}; +class IndexerPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Indexer, IndexerPermuteDevices, - 
testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(IndexerPermuteDevices::TestCases())); TEST_P(IndexerPermuteDevices, TensorRef) { core::Device device = GetParam(); diff --git a/cpp/tests/core/Linalg.cpp b/cpp/tests/core/Linalg.cpp index 2c8be12323b..3bd9775e534 100644 --- a/cpp/tests/core/Linalg.cpp +++ b/cpp/tests/core/Linalg.cpp @@ -11,6 +11,7 @@ #include "open3d/core/AdvancedIndexing.h" #include "open3d/core/Dtype.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/SYCLUtils.h" #include "open3d/core/SizeVector.h" #include "open3d/core/Tensor.h" #include "open3d/core/kernel/Kernel.h" @@ -23,10 +24,10 @@ namespace open3d { namespace tests { -class LinalgPermuteDevices : public PermuteDevices {}; +class LinalgPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Linalg, LinalgPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(LinalgPermuteDevices::TestCases())); TEST_P(LinalgPermuteDevices, Matmul) { const float EPSILON = 1e-8; @@ -482,6 +483,9 @@ TEST_P(LinalgPermuteDevices, LeastSquares) { const float EPSILON = 1e-5; core::Device device = GetParam(); + if (core::sy::GetDeviceType(device) == "cpu") { + GTEST_SKIP() << "MKL unsupported SYCL device."; + } core::Dtype dtype = core::Float32; // Solve test. 
diff --git a/cpp/tests/core/MemoryManager.cpp b/cpp/tests/core/MemoryManager.cpp index 9e3570a5af0..e4017da7e63 100644 --- a/cpp/tests/core/MemoryManager.cpp +++ b/cpp/tests/core/MemoryManager.cpp @@ -17,9 +17,10 @@ namespace open3d { namespace tests { class MemoryManagerPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(MemoryManager, - MemoryManagerPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +INSTANTIATE_TEST_SUITE_P( + MemoryManager, + MemoryManagerPermuteDevices, + testing::ValuesIn(MemoryManagerPermuteDevices::TestCases())); class MemoryManagerPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( diff --git a/cpp/tests/core/ParallelForSYCL.cpp b/cpp/tests/core/ParallelForSYCL.cpp new file mode 100644 index 00000000000..ff78804faea --- /dev/null +++ b/cpp/tests/core/ParallelForSYCL.cpp @@ -0,0 +1,65 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/ParallelForSYCL.h" + +#include + +#include "open3d/Macro.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include "open3d/core/Tensor.h" +#include "tests/Tests.h" +#include "tests/core/CoreTest.h" + +struct TestIndexerFillKernel { + TestFillKernel(const core::Indexer &indexer_, int64_t multiplier_) + : indexer(indexer_), multiplier(multiplier_) {} + void operator()(int64_t idx) { + indexer.GetOutputPtr(0)[idx] = idx * multiplier; + } + +private: + core::Indexer indexer; + int64_t multiplier; +}; + +struct TestPtrFillKernel { + TestFillKernel(int64_t *out_, int64_t multiplier_) + : out(out_), multiplier(multiplier_) {} + void operator()(int64_t idx) { out[idx] = idx * multiplier; } + +private: + 
int64_t *out; + int64_t multiplier; +}; + +TEST(ParallelForSYCL, FunctorSYCL) { + const core::Device device("SYCL:0"); + const size_t N = 10000000; + core::Indexer indexer({}, tensor, DtypePolicy::NONE); + int64_t multiplier = 2; + + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL(device, indexer, + multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL( + device, N, tensor.GetDataPtr(), multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } +} \ No newline at end of file diff --git a/cpp/tests/core/Tensor.cpp b/cpp/tests/core/Tensor.cpp index 9f788116be7..25d56d24aee 100644 --- a/cpp/tests/core/Tensor.cpp +++ b/cpp/tests/core/Tensor.cpp @@ -15,6 +15,7 @@ #include "open3d/core/AdvancedIndexing.h" #include "open3d/core/Dtype.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/SYCLUtils.h" #include "open3d/core/SizeVector.h" #include "open3d/core/kernel/Kernel.h" #include "open3d/utility/FileSystem.h" @@ -28,13 +29,13 @@ namespace tests { class TensorPermuteDevices : public PermuteDevices {}; INSTANTIATE_TEST_SUITE_P(Tensor, TensorPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(TensorPermuteDevices::TestCases())); -class TensorPermuteDevicesWithSYCL : public PermuteDevices {}; +class TensorPermuteDevicesWithSYCL : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P( Tensor, TensorPermuteDevicesWithSYCL, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); + testing::ValuesIn(TensorPermuteDevicesWithSYCL::TestCases())); class TensorPermuteDevicePairs : public PermuteDevicePairs {}; INSTANTIATE_TEST_SUITE_P( @@ -95,7 +96,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, 
ConstructorBool) { EXPECT_EQ(t.GetDtype(), dtype); } -TEST_P(TensorPermuteDevices, WithInitValue) { +TEST_P(TensorPermuteDevicesWithSYCL, WithInitValue) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5}; @@ -223,7 +224,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, WithInitValueSizeMismatch) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Arange) { +TEST_P(TensorPermuteDevicesWithSYCL, Arange) { core::Device device = GetParam(); core::Tensor arange; @@ -266,28 +267,28 @@ TEST_P(TensorPermuteDevices, Arange) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Fill) { +TEST_P(TensorPermuteDevicesWithSYCL, Fill) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Fill(1); EXPECT_EQ(t.ToFlatVector(), std::vector({1, 1, 1, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, FillBool) { +TEST_P(TensorPermuteDevicesWithSYCL, FillBool) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, false), {2, 3}, core::Bool, device); t.Fill(true); EXPECT_EQ(t.ToFlatVector(), std::vector(2 * 3, true)); } -TEST_P(TensorPermuteDevices, FillSlice) { +TEST_P(TensorPermuteDevicesWithSYCL, FillSlice) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Slice(1, 0, 3, 2).Fill(1); // t[:, 0:3:2].fill(1) EXPECT_EQ(t.ToFlatVector(), std::vector({1, 0, 1, 1, 0, 1})); } -TEST_P(TensorPermuteDevicePairs, IndexSetFillFancy) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetFillFancy) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -348,7 +349,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBool) { EXPECT_EQ(dst_t.ToFlatVector(), vals); } -TEST_P(TensorPermuteDevices, To) { +TEST_P(TensorPermuteDevicesWithSYCL, To) { core::Device device = GetParam(); core::SizeVector shape{2, 3}; @@ -364,7 +365,7 @@ TEST_P(TensorPermuteDevices, To) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } 
-TEST_P(TensorPermuteDevicePairs, ToDevice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, ToDevice) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -382,7 +383,7 @@ TEST_P(TensorPermuteDevicePairs, ToDevice) { EXPECT_ANY_THROW(src_t.To(core::Device("CUDA:100000"))); } -TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -403,7 +404,7 @@ TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Expand) { +TEST_P(TensorPermuteDevicesWithSYCL, Expand) { core::Device device = GetParam(); core::Dtype dtype(core::Float32); @@ -423,7 +424,7 @@ TEST_P(TensorPermuteDevices, Expand) { EXPECT_EQ(dst_t.GetDataPtr(), src_t.GetDataPtr()); } -TEST_P(TensorPermuteDevices, Flatten) { +TEST_P(TensorPermuteDevicesWithSYCL, Flatten) { core::Device device = GetParam(); // Flatten 0-D Tensor. 
@@ -562,7 +563,7 @@ TEST_P(TensorPermuteSizesDefaultStridesAndDevices, DefaultStrides) { EXPECT_EQ(t.GetStrides(), expected_strides); } -TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { +TEST_P(TensorPermuteDevicesWithSYCL, OperatorSquareBrackets) { core::Device device = GetParam(); // Zero dim @@ -623,7 +624,7 @@ TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { EXPECT_EQ(t_1_2_3.GetBlob(), t.GetBlob()); } -TEST_P(TensorPermuteDevices, Item) { +TEST_P(TensorPermuteDevicesWithSYCL, Item) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -645,7 +646,7 @@ TEST_P(TensorPermuteDevices, Item) { EXPECT_EQ(t_1_2_3.Item(), 23.f); } -TEST_P(TensorPermuteDevices, ItemBool) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemBool) { core::Device device = GetParam(); std::vector vals{true, true, false}; @@ -660,7 +661,7 @@ TEST_P(TensorPermuteDevices, ItemBool) { EXPECT_EQ(t[2].Item(), false); } -TEST_P(TensorPermuteDevices, ItemAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemAssign) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, @@ -782,7 +783,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyContiguous) { t_1_copy.GetBlob()->GetDataPtr()); // Points to beginning of Blob } -TEST_P(TensorPermuteDevices, Slice) { +TEST_P(TensorPermuteDevicesWithSYCL, Slice) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -839,7 +840,7 @@ TEST_P(TensorPermuteDevices, Slice) { EXPECT_EQ(t_5.ToFlatVector(), std::vector({12, 16})); } -TEST_P(TensorPermuteDevices, GetItem) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItem) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -858,7 +859,7 @@ TEST_P(TensorPermuteDevices, GetItem) { std::vector({12, 14, 16, 18, 20, 22})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = 
core::Tensor::Init( {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -874,7 +875,7 @@ TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { std::vector({0, 1, 1, 2, 3, 5, 8, 13, 21})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -894,7 +895,7 @@ TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({13, 17, 14, 18})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -914,7 +915,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { 16, 17, 18, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -937,7 +938,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { 16, 200, 400, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SliceAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, SliceAssign) { core::Device device = GetParam(); core::Tensor dst = core::Tensor::Init( @@ -991,7 +992,7 @@ TEST_P(TensorPermuteDevices, SliceAssign) { 16, 17, 18, 19, 203, 21, 223, 23})); } -TEST_P(TensorPermuteDevices, Append) { +TEST_P(TensorPermuteDevicesWithSYCL, Append) { core::Device device = GetParam(); core::Tensor self, other, output; @@ -1090,7 +1091,7 @@ TEST_P(TensorPermuteDevices, Append) { } } -TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyNonContiguous) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1127,7 +1128,7 @@ TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { } } -TEST_P(TensorPermuteDevicePairs, IndexGet) { 
+TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1164,7 +1165,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet) { EXPECT_EQ(src_t.GetDtype(), dst_t.GetDtype()); } -TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetNegative) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1188,7 +1189,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet2DBroadcastedIndex) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1218,7 +1219,8 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { 28, 29, 30, 31, 40, 41, 42, 43})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, + IndexGet2DBroadcastedIndexSplitBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1249,7 +1251,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { 16, 20, 40, 44, 17, 21, 41, 45})); } -TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetAssignToBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1279,7 +1281,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { std::vector({5, 10, 17, 22, 5, 10, 17, 22, 5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSeparateBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1303,7 +1305,7 @@ 
TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { std::vector({0, 4, 8, 13, 17, 21})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSliceEnd) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1326,7 +1328,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { std::vector({0, 1, 2, 3, 16, 17, 18, 19})); } -TEST_P(TensorPermuteDevicePairs, IndexSet) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSet) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1405,7 +1407,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSet) { core::Tensor::Init({10, 11}, src_device))); } -TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1430,7 +1432,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { 0, 0, 0, 0, 20, 20, 20, 0, 0, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, IndexAdd_) { +TEST_P(TensorPermuteDevicesWithSYCL, IndexAdd_) { core::Device device = GetParam(); const int tensor_size = 100; @@ -1464,7 +1466,7 @@ TEST_P(TensorPermuteDevices, IndexAdd_) { } } -TEST_P(TensorPermuteDevices, Permute) { +TEST_P(TensorPermuteDevicesWithSYCL, Permute) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1492,7 +1494,7 @@ TEST_P(TensorPermuteDevices, Permute) { 17, 21, 14, 18, 22, 15, 19, 23})); } -TEST_P(TensorPermuteDevices, Transpose) { +TEST_P(TensorPermuteDevicesWithSYCL, Transpose) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1512,7 +1514,7 @@ TEST_P(TensorPermuteDevices, Transpose) { EXPECT_THROW(t.Transpose(3, 5), std::runtime_error); } -TEST_P(TensorPermuteDevices, T) { +TEST_P(TensorPermuteDevicesWithSYCL, T) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, @@ -1532,7 +1534,7 @@ TEST_P(TensorPermuteDevices, T) { EXPECT_THROW(t_3d.T(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Det) { +TEST_P(TensorPermuteDevicesWithSYCL, Det) { core::Device device = GetParam(); // Det supports both Float32 and Float64. core::Dtype dtype = core::Float32; @@ -1556,7 +1558,7 @@ TEST_P(TensorPermuteDevices, Det) { EXPECT_ANY_THROW(core::Tensor::Ones({3, 4}, dtype, device).Det()); } -TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { +TEST_P(TensorPermuteDevicesWithSYCL, ShallowCopyConstructor) { core::Device device = GetParam(); core::Tensor t({2, 3}, core::Float32, device); @@ -1579,7 +1581,7 @@ TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { EXPECT_EQ(t.GetDataPtr(), FirstTensorDataPtr({t})); } -TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { +TEST_P(TensorPermuteDevicesWithSYCL, AdvancedIndexing_IsIndexSplittedBySlice) { core::Device device = GetParam(); core::Tensor idx = core::Tensor::Init({1, 2}, device); @@ -1602,7 +1604,7 @@ TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { {idx, slice, slice, idx})); } -TEST_P(TensorPermuteDevices, Add) { +TEST_P(TensorPermuteDevicesWithSYCL, Add) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1612,7 +1614,7 @@ TEST_P(TensorPermuteDevices, Add) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_) { +TEST_P(TensorPermuteDevicesWithSYCL, Add_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1622,7 +1624,7 @@ TEST_P(TensorPermuteDevices, Add_) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_BroadcastException) { +TEST_P(TensorPermuteDevicesWithSYCL, Add_BroadcastException) { // A.shape = ( 3, 4) // B.shape = (2, 3, 4) // A += B should throw exception. 
@@ -1642,7 +1644,7 @@ TEST_P(TensorPermuteDevices, Add_BroadcastException) { 20, 22, 24, 26, 28, 30, 32, 34})); } -TEST_P(TensorPermuteDevices, Sub) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1652,7 +1654,7 @@ TEST_P(TensorPermuteDevices, Sub) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Sub_) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1662,7 +1664,7 @@ TEST_P(TensorPermuteDevices, Sub_) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Mul) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1672,7 +1674,7 @@ TEST_P(TensorPermuteDevices, Mul) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Mul_) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1682,27 +1684,31 @@ TEST_P(TensorPermuteDevices, Mul_) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Div) { +TEST_P(TensorPermuteDevicesWithSYCL, Div) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); core::Tensor c = a / b; - EXPECT_EQ(c.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor c_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(c.AllClose(c_ref)); } -TEST_P(TensorPermuteDevices, Div_) { +TEST_P(TensorPermuteDevicesWithSYCL, Div_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = 
core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); a /= b; - EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor a_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(a.AllClose(a_ref)); } -TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1774,7 +1780,7 @@ TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { EXPECT_THROW(src.Sum({2, -1}, true), std::runtime_error); // Repeated. } -TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumNotKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1829,7 +1835,7 @@ TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { EXPECT_EQ(dst.ToFlatVector(), std::vector({276.f})); } -TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumSpecialShapes) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -1910,22 +1916,23 @@ TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) { core::Device device = GetParam(); - core::SizeVector shape{3, 7, 8234719}; + constexpr int64_t large = 8234719; + core::SizeVector shape{3, 7, large}; int64_t size = shape.NumElements(); std::vector vals(size, 1); core::Tensor src(vals, shape, core::Int32, device); core::Tensor dst; dst = src.Sum({}, false); - EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, 8234719})); - EXPECT_EQ(dst.ToFlatVector(), std::vector(3 * 7 * 8234719, 1)); + EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, large})); + EXPECT_EQ(dst.ToFlatVector(), std::vector(3 * 7 * large, 1)); dst = src.Sum({0}, false); - EXPECT_EQ(dst.GetShape(), core::SizeVector({7, 8234719})); - 
EXPECT_EQ(dst.ToFlatVector(), std::vector(7 * 8234719, 3)); + EXPECT_EQ(dst.GetShape(), core::SizeVector({7, large})); + EXPECT_EQ(dst.ToFlatVector(), std::vector(7 * large, 3)); } -TEST_P(TensorPermuteDevices, ReduceSum64bit1D) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) { core::Device device = GetParam(); // num_bytes = 8 * (2 ^ 28) + 1 = 2 ^ 31 + 1 ~= 2GB // max_offsets = num_bytes - 1 = 2 ^ 31 @@ -2038,7 +2045,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) { std::vector(large_dim - 30, 2)); } -TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumLargeArray) { core::Device device = GetParam(); std::vector sizes = TensorSizes::TestCases(); @@ -2060,7 +2067,7 @@ TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { } } -TEST_P(TensorPermuteDevices, ReduceProd) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceProd) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2117,7 +2124,7 @@ TEST_P(TensorPermuteDevices, ReduceProd) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMin) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMin) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2170,7 +2177,7 @@ TEST_P(TensorPermuteDevices, ReduceMin) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMax) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMax) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2225,7 +2232,7 @@ TEST_P(TensorPermuteDevices, ReduceMax) { EXPECT_EQ(dst.ToFlatVector(), std::vector({23.f})); } -TEST_P(TensorPermuteDevices, ReduceMaxFloatLimit) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMaxFloatLimit) { // std::numeric_limits should use lowest() instead of min(). 
core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({-2.f, -1.f}, device); @@ -2237,8 +2244,13 @@ TEST_P(TensorPermuteDevices, ReduceMaxFloatLimit) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1})); } -TEST_P(TensorPermuteDevices, ReduceArgMin) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceArgMin) { core::Device device = GetParam(); + if (core::sy::GetDeviceType(device) == "cpu") { + GTEST_SKIP() << "allocateMemSubBuffer() API failed with unknown error " + "on CPU."; + } + core::Tensor src = core::Tensor::Init( {{{22, 23, 20, 9}, {6, 14, 18, 13}, {15, 3, 17, 0}}, {{7, 21, 11, 1}, {4, 2, 10, 19}, {5, 8, 16, 12}}}, @@ -2265,8 +2277,12 @@ TEST_P(TensorPermuteDevices, ReduceArgMin) { std::vector({3, 0, 3, 3, 1, 0})); } -TEST_P(TensorPermuteDevices, ReduceArgMax) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceArgMax) { core::Device device = GetParam(); + if (core::sy::GetDeviceType(device) == "cpu") { + GTEST_SKIP() << "allocateMemSubBuffer() API failed with unknown error " + "on CPU."; + } core::Tensor src = core::Tensor::Init( {{{22, 23, 20, 9}, {6, 14, 18, 13}, {15, 3, 17, 0}}, {{7, 21, 11, 1}, {4, 2, 10, 19}, {5, 8, 16, 12}}}, @@ -2293,7 +2309,7 @@ TEST_P(TensorPermuteDevices, ReduceArgMax) { std::vector({1, 2, 2, 1, 3, 2})); } -TEST_P(TensorPermuteDevices, Sqrt) { +TEST_P(TensorPermuteDevicesWithSYCL, Sqrt) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{0, 1, 4}, {9, 16, 25}}, device); @@ -2323,7 +2339,7 @@ TEST_P(TensorPermuteDevices, Sqrt) { std::vector({0, 1, 2, 3, 4, 5})); } -TEST_P(TensorPermuteDevices, Sin) { +TEST_P(TensorPermuteDevicesWithSYCL, Sin) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2346,7 +2362,7 @@ TEST_P(TensorPermuteDevices, Sin) { EXPECT_THROW(src.Sin(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Cos) { +TEST_P(TensorPermuteDevicesWithSYCL, Cos) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2369,7 +2385,7 @@ 
TEST_P(TensorPermuteDevices, Cos) { EXPECT_THROW(src.Cos(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Neg) { +TEST_P(TensorPermuteDevicesWithSYCL, Neg) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2388,7 +2404,7 @@ TEST_P(TensorPermuteDevices, Neg) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, UnaryMinus) { +TEST_P(TensorPermuteDevicesWithSYCL, UnaryMinus) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2403,7 +2419,7 @@ TEST_P(TensorPermuteDevices, UnaryMinus) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, Exp) { +TEST_P(TensorPermuteDevicesWithSYCL, Exp) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2426,7 +2442,7 @@ TEST_P(TensorPermuteDevices, Exp) { EXPECT_THROW(src.Exp(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Abs) { +TEST_P(TensorPermuteDevicesWithSYCL, Abs) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2444,7 +2460,7 @@ TEST_P(TensorPermuteDevices, Abs) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsNan) { +TEST_P(TensorPermuteDevicesWithSYCL, IsNan) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2458,7 +2474,7 @@ TEST_P(TensorPermuteDevices, IsNan) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsInf) { +TEST_P(TensorPermuteDevicesWithSYCL, IsInf) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2472,7 +2488,7 @@ TEST_P(TensorPermuteDevices, IsInf) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsFinite) { +TEST_P(TensorPermuteDevicesWithSYCL, IsFinite) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2486,7 +2502,7 @@ TEST_P(TensorPermuteDevices, IsFinite) { 
EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Floor) { +TEST_P(TensorPermuteDevicesWithSYCL, Floor) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2500,7 +2516,7 @@ TEST_P(TensorPermuteDevices, Floor) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Ceil) { +TEST_P(TensorPermuteDevicesWithSYCL, Ceil) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2514,7 +2530,7 @@ TEST_P(TensorPermuteDevices, Ceil) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Round) { +TEST_P(TensorPermuteDevicesWithSYCL, Round) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2528,7 +2544,7 @@ TEST_P(TensorPermuteDevices, Round) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Trunc) { +TEST_P(TensorPermuteDevicesWithSYCL, Trunc) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2542,7 +2558,7 @@ TEST_P(TensorPermuteDevices, Trunc) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNot) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNot) { core::Device device = GetParam(); std::vector src_vals{true, false, true, false}; @@ -2560,7 +2576,7 @@ TEST_P(TensorPermuteDevices, LogicalNot) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNotFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNotFloat) { core::Device device = GetParam(); std::vector src_vals{0, -1, 1, 2}; @@ -2584,7 +2600,7 @@ TEST_P(TensorPermuteDevices, LogicalNotFloat) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalAnd) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAnd) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2603,7 +2619,7 @@ TEST_P(TensorPermuteDevices, LogicalAnd) { 
std::vector({true, false, false, false})); } -TEST_P(TensorPermuteDevices, LogicalAndFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAndFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, 0}, {0, 0}}, device); @@ -2616,7 +2632,7 @@ TEST_P(TensorPermuteDevices, LogicalAndFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, LogicalOr) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOr) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2635,7 +2651,7 @@ TEST_P(TensorPermuteDevices, LogicalOr) { std::vector({true, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalOrFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOrFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2648,7 +2664,7 @@ TEST_P(TensorPermuteDevices, LogicalOrFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, LogicalXor) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXor) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2664,7 +2680,7 @@ TEST_P(TensorPermuteDevices, LogicalXor) { std::vector({false, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalXorFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXorFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2677,7 +2693,7 @@ TEST_P(TensorPermuteDevices, LogicalXorFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, Gt) { +TEST_P(TensorPermuteDevicesWithSYCL, Gt) { core::Device device = GetParam(); core::Tensor 
a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2693,7 +2709,7 @@ TEST_P(TensorPermuteDevices, Gt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Lt) { +TEST_P(TensorPermuteDevicesWithSYCL, Lt) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2709,7 +2725,7 @@ TEST_P(TensorPermuteDevices, Lt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Ge) { +TEST_P(TensorPermuteDevicesWithSYCL, Ge) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2725,7 +2741,7 @@ TEST_P(TensorPermuteDevices, Ge) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Le) { +TEST_P(TensorPermuteDevicesWithSYCL, Le) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2741,7 +2757,7 @@ TEST_P(TensorPermuteDevices, Le) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Eq) { +TEST_P(TensorPermuteDevicesWithSYCL, Eq) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2757,7 +2773,7 @@ TEST_P(TensorPermuteDevices, Eq) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, Ne) { +TEST_P(TensorPermuteDevicesWithSYCL, Ne) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); @@ -2774,7 +2790,7 @@ TEST_P(TensorPermuteDevices, Ne) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, BooleanIndex) { 
+TEST_P(TensorPermuteDevicesWithSYCL, BooleanIndex) { core::Device device = GetParam(); // a[a < 0] = 0 @@ -2798,7 +2814,7 @@ TEST_P(TensorPermuteDevices, BooleanIndex) { EXPECT_EQ(y.GetDtype(), core::Float32); } -TEST_P(TensorPermuteDevices, NonZeroNumpy) { +TEST_P(TensorPermuteDevicesWithSYCL, NonZeroNumpy) { core::Device device = GetParam(); core::Tensor a = @@ -2812,7 +2828,7 @@ TEST_P(TensorPermuteDevices, NonZeroNumpy) { EXPECT_EQ(results[1].GetShape(), core::SizeVector{3}); } -TEST_P(TensorPermuteDevices, All) { +TEST_P(TensorPermuteDevicesWithSYCL, All) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2843,7 +2859,7 @@ TEST_P(TensorPermuteDevices, All) { EXPECT_ANY_THROW(t.All(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, Any) { +TEST_P(TensorPermuteDevicesWithSYCL, Any) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2875,7 +2891,7 @@ TEST_P(TensorPermuteDevices, Any) { EXPECT_ANY_THROW(t.Any(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, CreationEmpty) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationEmpty) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Empty({}, core::Float32, device); @@ -2899,7 +2915,7 @@ TEST_P(TensorPermuteDevices, CreationEmpty) { EXPECT_EQ(a.NumElements(), 6); } -TEST_P(TensorPermuteDevices, CreationFull) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationFull) { core::Device device = GetParam(); const float fill_value = 100; @@ -2934,7 +2950,7 @@ TEST_P(TensorPermuteDevices, CreationFull) { std::vector(a.NumElements(), fill_value)); } -TEST_P(TensorPermuteDevices, CreationZeros) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationZeros) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Zeros({2, 3}, core::Float32, device); @@ -2943,7 +2959,7 @@ TEST_P(TensorPermuteDevices, CreationZeros) { 
EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 0)); } -TEST_P(TensorPermuteDevices, CreationOnes) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationOnes) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Ones({2, 3}, core::Float32, device); @@ -2952,7 +2968,7 @@ TEST_P(TensorPermuteDevices, CreationOnes) { EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 1)); } -TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { +TEST_P(TensorPermuteDevicesWithSYCL, ScalarOperatorOverload) { core::Device device = GetParam(); core::Tensor a; core::Tensor b; @@ -3038,7 +3054,7 @@ TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { EXPECT_EQ(a.ToFlatVector(), std::vector({5, 5})); } -TEST_P(TensorPermuteDevices, ReduceMean) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMean) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -3129,8 +3145,9 @@ TEST_P(TensorPermuteDevices, ReduceMean) { EXPECT_TRUE(std::isnan(dst.ToFlatVector()[0])); } -TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { +TEST_P(TensorPermuteDevicesWithSYCL, ToDLPackFromDLPack) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor src_t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, {{12, 13, 14, 15}, {16, 17, 18, 19}, {20, 21, 22, 23}}}, @@ -3162,7 +3179,7 @@ TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { std::vector({12, 14, 20, 22})); } -TEST_P(TensorPermuteDevices, IsSame) { +TEST_P(TensorPermuteDevicesWithSYCL, IsSame) { core::Device device = GetParam(); // "Shallow" copy. 
@@ -3206,7 +3223,7 @@ TEST_P(TensorPermuteDevices, IsSame) { EXPECT_TRUE(vec[0].IsSame(vec[1])); } -TEST_P(TensorPermuteDevices, RValueScalar) { +TEST_P(TensorPermuteDevicesWithSYCL, RValueScalar) { const core::Device &device = GetParam(); core::Tensor t, t_ref; @@ -3271,7 +3288,7 @@ TEST_P(TensorPermuteDevices, RValueScalar) { EXPECT_TRUE(t.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip) { core::Device device = GetParam(); core::Tensor t, t_clip, t_ref; @@ -3324,7 +3341,7 @@ TEST_P(TensorPermuteDevices, Clip) { EXPECT_TRUE(t_clip.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip_) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip_) { core::Device device = GetParam(); core::Tensor t, t_ref; @@ -3414,7 +3431,7 @@ TEST_P(TensorPermuteDevicePairs, AllEqual) { EXPECT_FALSE(src.AllEqual(dst)); } -TEST_P(TensorPermuteDevices, Iterator) { +TEST_P(TensorPermuteDevicesWithSYCL, Iterator) { core::Device device = GetParam(); core::Tensor t; @@ -3492,7 +3509,7 @@ TEST_P(TensorPermuteDevices, Iterator) { } } -TEST_P(TensorPermuteDevices, ConstIterator) { +TEST_P(TensorPermuteDevicesWithSYCL, ConstIterator) { core::Device device = GetParam(); core::Tensor t; @@ -3565,7 +3582,7 @@ TEST_P(TensorPermuteDevices, ConstIterator) { } } -TEST_P(TensorPermuteDevices, TakeOwnership) { +TEST_P(TensorPermuteDevicesWithSYCL, TakeOwnership) { core::Device device = GetParam(); if (!device.IsCPU()) { GTEST_SKIP(); diff --git a/cpp/tests/core/TensorCheck.cpp b/cpp/tests/core/TensorCheck.cpp index 8b789b481b6..14fb471d874 100644 --- a/cpp/tests/core/TensorCheck.cpp +++ b/cpp/tests/core/TensorCheck.cpp @@ -14,10 +14,11 @@ namespace open3d { namespace tests { -class TensorCheckPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Tensor, - TensorCheckPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorCheckPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Tensor, + 
TensorCheckPermuteDevices, + testing::ValuesIn(TensorCheckPermuteDevices::TestCases())); TEST_P(TensorCheckPermuteDevices, AssertTensorDtype) { core::Device device = GetParam(); diff --git a/cpp/tests/core/TensorFunction.cpp b/cpp/tests/core/TensorFunction.cpp index 0d25b941dfd..17fab12edc9 100644 --- a/cpp/tests/core/TensorFunction.cpp +++ b/cpp/tests/core/TensorFunction.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorFunctionPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Tensor, - TensorFunctionPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorFunctionPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Tensor, + TensorFunctionPermuteDevices, + testing::ValuesIn(TensorFunctionPermuteDevices::TestCases())); TEST_P(TensorFunctionPermuteDevices, Concatenate) { core::Device device = GetParam(); @@ -105,7 +106,7 @@ TEST_P(TensorFunctionPermuteDevices, Concatenate) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. EXPECT_ANY_THROW(core::Concatenate({a, b.To(core::Float64), c})); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW( core::Concatenate({a, b.To(core::Device("CPU:0")), c})); } @@ -205,7 +206,7 @@ TEST_P(TensorFunctionPermuteDevices, Append) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. 
EXPECT_ANY_THROW(core::Append(self, other.To(core::Float64))); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW(core::Append(self, other.To(core::Device("CPU:0")))); } diff --git a/cpp/tests/core/TensorList.cpp b/cpp/tests/core/TensorList.cpp index 39091f3d386..67d819a8c9a 100644 --- a/cpp/tests/core/TensorList.cpp +++ b/cpp/tests/core/TensorList.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorListPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorList, - TensorListPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorListPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorList, + TensorListPermuteDevices, + testing::ValuesIn(TensorListPermuteDevices::TestCases())); TEST_P(TensorListPermuteDevices, EmptyConstructor) { core::Device device = GetParam(); diff --git a/cpp/tests/core/TensorObject.cpp b/cpp/tests/core/TensorObject.cpp index 0ff294dbc88..62c537efabf 100644 --- a/cpp/tests/core/TensorObject.cpp +++ b/cpp/tests/core/TensorObject.cpp @@ -22,12 +22,13 @@ namespace open3d { namespace tests { -class TensorObjectPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorObject, - TensorObjectPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorObjectPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorObject, + TensorObjectPermuteDevices, + testing::ValuesIn(TensorObjectPermuteDevices::TestCases())); -class TensorObjectPermuteDevicePairs : public PermuteDevicePairs {}; +class TensorObjectPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( TensorObject, TensorObjectPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp index 2139b40263c..e62f8b89a68 100644 --- a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp +++ 
b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(AxisAlignedBoundingBox, - AxisAlignedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + AxisAlignedBoundingBox, + AxisAlignedBoundingBoxPermuteDevices, + testing::ValuesIn(AxisAlignedBoundingBoxPermuteDevices::TestCases())); -class AxisAlignedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class AxisAlignedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( AxisAlignedBoundingBox, AxisAlignedBoundingBoxPermuteDevicePairs, @@ -276,6 +278,7 @@ TEST_P(AxisAlignedBoundingBoxPermuteDevices, GetBoxPoints) { TEST_P(AxisAlignedBoundingBoxPermuteDevices, GetPointIndicesWithinBoundingBox) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor min_bound = core::Tensor::Init({-1, -1, -1}, device); core::Tensor max_bound = core::Tensor::Init({1, 1, 1}, device); diff --git a/cpp/tests/t/geometry/LineSet.cpp b/cpp/tests/t/geometry/LineSet.cpp index 30c86eaa78f..272761e3169 100644 --- a/cpp/tests/t/geometry/LineSet.cpp +++ b/cpp/tests/t/geometry/LineSet.cpp @@ -16,12 +16,12 @@ namespace open3d { namespace tests { -class LineSetPermuteDevices : public PermuteDevices {}; +class LineSetPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(LineSet, LineSetPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(LineSetPermuteDevices::TestCases())); -class LineSetPermuteDevicePairs : public PermuteDevicePairs {}; +class LineSetPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( LineSet, LineSetPermuteDevicePairs, @@ -277,6 +277,7 @@ 
TEST_P(LineSetPermuteDevices, GetMinBound_GetMaxBound_GetCenter) { TEST_P(LineSetPermuteDevices, Transform) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::LineSet lineset(device); core::Tensor transformation = core::Tensor::Init( @@ -333,6 +334,7 @@ TEST_P(LineSetPermuteDevices, Scale) { TEST_P(LineSetPermuteDevices, Rotate) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::LineSet lineset(device); core::Tensor rotation = core::Tensor::Init( diff --git a/cpp/tests/t/geometry/OrientedBoundingBox.cpp b/cpp/tests/t/geometry/OrientedBoundingBox.cpp index 5a730f4cfb2..d2bda07db22 100644 --- a/cpp/tests/t/geometry/OrientedBoundingBox.cpp +++ b/cpp/tests/t/geometry/OrientedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class OrientedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(OrientedBoundingBox, - OrientedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class OrientedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + OrientedBoundingBox, + OrientedBoundingBoxPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class OrientedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class OrientedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( OrientedBoundingBox, OrientedBoundingBoxPermuteDevicePairs, @@ -248,6 +250,7 @@ TEST_P(OrientedBoundingBoxPermuteDevices, GetBoxPoints) { TEST_P(OrientedBoundingBoxPermuteDevices, GetPointIndicesWithinBoundingBox) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor center = core::Tensor::Init({0.5, 0.5, 0.5}, device); core::Tensor rotation = core::Tensor::Eye(3, core::Float32, device); diff --git a/cpp/tests/t/geometry/PointCloud.cpp 
b/cpp/tests/t/geometry/PointCloud.cpp index 2a74c9c75c1..342aa40eb32 100644 --- a/cpp/tests/t/geometry/PointCloud.cpp +++ b/cpp/tests/t/geometry/PointCloud.cpp @@ -24,12 +24,13 @@ namespace open3d { namespace tests { -class PointCloudPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(PointCloud, - PointCloudPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class PointCloudPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + PointCloud, + PointCloudPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class PointCloudPermuteDevicePairs : public PermuteDevicePairs {}; +class PointCloudPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( PointCloud, PointCloudPermuteDevicePairs, @@ -170,6 +171,8 @@ TEST_P(PointCloudPermuteDevices, Copy) { TEST_P(PointCloudPermuteDevices, Transform) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + core::Dtype dtype = core::Float32; t::geometry::PointCloud pcd(device); core::Tensor transformation( @@ -225,6 +228,7 @@ TEST_P(PointCloudPermuteDevices, Scale) { TEST_P(PointCloudPermuteDevices, Rotate) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Dtype dtype = core::Float32; t::geometry::PointCloud pcd(device); core::Tensor rotation(std::vector{1, 1, 0, 0, 1, 1, 0, 1, 0}, {3, 3}, @@ -245,6 +249,7 @@ TEST_P(PointCloudPermuteDevices, Rotate) { TEST_P(PointCloudPermuteDevices, NormalizeNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init({{0, 0, 0}, {0, 0, 1}, @@ -278,6 +283,7 @@ TEST_P(PointCloudPermuteDevices, NormalizeNormals) { TEST_P(PointCloudPermuteDevices, EstimateNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init({{0, 0, 0}, {0, 0, 1}, 
@@ -318,6 +324,7 @@ TEST_P(PointCloudPermuteDevices, EstimateNormals) { TEST_P(PointCloudPermuteDevices, OrientNormalsToAlignWithDirection) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init({{0, 0, 0}, {0, 0, 1}, @@ -357,6 +364,7 @@ TEST_P(PointCloudPermuteDevices, OrientNormalsToAlignWithDirection) { TEST_P(PointCloudPermuteDevices, OrientNormalsTowardsCameraLocation) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init( {{0, 0, 0}, {0, 1, 0}, {1, 0, 0}, {1, 1, 0}}, device); @@ -644,6 +652,7 @@ TEST_P(PointCloudPermuteDevices, CreateFromRGBDImage) { using ::testing::UnorderedElementsAreArray; core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; float depth_scale = 1000.f, depth_max = 3.f; int stride = 1; core::Tensor im_depth = @@ -691,6 +700,7 @@ TEST_P(PointCloudPermuteDevices, CreateFromRGBDImage) { TEST_P(PointCloudPermuteDevices, CreateFromRGBDOrDepthImageWithNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; if (!t::geometry::Image::HAVE_IPP && device.GetType() == @@ -891,6 +901,7 @@ TEST_P(PointCloudPermuteDevices, SelectByIndex) { TEST_P(PointCloudPermuteDevices, VoxelDownSample) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; // Value test t::geometry::PointCloud pcd_small( @@ -969,6 +980,7 @@ TEST_P(PointCloudPermuteDevices, FarthestPointDownSample) { TEST_P(PointCloudPermuteDevices, RemoveRadiusOutliers) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; const t::geometry::PointCloud pcd_small( core::Tensor::Init({{1.0, 1.0, 1.0}, @@ -994,6 +1006,7 @@ TEST_P(PointCloudPermuteDevices, RemoveRadiusOutliers) { TEST_P(PointCloudPermuteDevices, RemoveStatisticalOutliers) { core::Device device = GetParam(); + if 
(device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; data::PCDPointCloud sample_pcd_data; geometry::PointCloud pcd_legacy; @@ -1012,6 +1025,7 @@ TEST_P(PointCloudPermuteDevices, RemoveStatisticalOutliers) { TEST_P(PointCloudPermuteDevices, RemoveDuplicatedPoints) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; const t::geometry::PointCloud pcd_small( core::Tensor::Init({{1.0, 1.0, 1.0}, diff --git a/cpp/tests/t/geometry/TensorMap.cpp b/cpp/tests/t/geometry/TensorMap.cpp index edf80230073..9ae1a37e300 100644 --- a/cpp/tests/t/geometry/TensorMap.cpp +++ b/cpp/tests/t/geometry/TensorMap.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorMapPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorMap, - TensorMapPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorMapPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorMap, + TensorMapPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TensorMapPermuteDevices, Constructor) { core::Dtype dtype = core::Float32; diff --git a/cpp/tests/t/geometry/TriangleMesh.cpp b/cpp/tests/t/geometry/TriangleMesh.cpp index 9fd3837df84..32d53073ded 100644 --- a/cpp/tests/t/geometry/TriangleMesh.cpp +++ b/cpp/tests/t/geometry/TriangleMesh.cpp @@ -26,10 +26,11 @@ namespace open3d { namespace tests { -class TriangleMeshPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TriangleMesh, - TriangleMeshPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TriangleMeshPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TriangleMesh, + TriangleMeshPermuteDevices, + testing::ValuesIn(TriangleMeshPermuteDevices::TestCases())); TEST_P(TriangleMeshPermuteDevices, DefaultConstructor) { t::geometry::TriangleMesh mesh; @@ -252,6 +253,7 @@ TEST_P(TriangleMeshPermuteDevices, Has) { TEST_P(TriangleMeshPermuteDevices, 
Transform) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::TriangleMesh mesh(device); core::Tensor transformation = core::Tensor::Init( @@ -318,6 +320,7 @@ TEST_P(TriangleMeshPermuteDevices, Scale) { TEST_P(TriangleMeshPermuteDevices, Rotate) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::TriangleMesh mesh(device); core::Tensor rotation = core::Tensor::Init( @@ -338,6 +341,7 @@ TEST_P(TriangleMeshPermuteDevices, Rotate) { TEST_P(TriangleMeshPermuteDevices, NormalizeNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; std::shared_ptr mesh = open3d::geometry::TriangleMesh::CreateSphere(1.0, 3); @@ -356,6 +360,7 @@ TEST_P(TriangleMeshPermuteDevices, NormalizeNormals) { TEST_P(TriangleMeshPermuteDevices, ComputeTriangleNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; std::shared_ptr mesh = open3d::geometry::TriangleMesh::CreateSphere(1.0, 3); @@ -371,6 +376,7 @@ TEST_P(TriangleMeshPermuteDevices, ComputeTriangleNormals) { TEST_P(TriangleMeshPermuteDevices, ComputeVertexNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; std::shared_ptr mesh = open3d::geometry::TriangleMesh::CreateSphere(1.0, 3); @@ -1223,6 +1229,8 @@ TEST_P(TriangleMeshPermuteDevices, SelectByIndex) { TEST_P(TriangleMeshPermuteDevices, RemoveUnreferencedVertices) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + t::geometry::TriangleMesh mesh_empty{device}; // check completely empty mesh @@ -1353,9 +1361,11 @@ TEST_P(TriangleMeshPermuteDevices, RemoveUnreferencedVertices) { TEST_P(TriangleMeshPermuteDevices, ProjectImagesToAlbedo) { using namespace t::geometry; + using ::testing::AnyOf; using ::testing::ElementsAre; using ::testing::FloatEq; core::Device device = GetParam(); + if 
(!device.IsCPU() || !Image::HAVE_IPP) GTEST_SKIP() << "Not Implemented!"; TriangleMesh sphere = TriangleMesh::FromLegacy(*geometry::TriangleMesh::CreateSphere( 1.0, 20, /*create_uv_map=*/true)); @@ -1397,11 +1407,19 @@ TEST_P(TriangleMeshPermuteDevices, ProjectImagesToAlbedo) { .To(core::Float32) .Mean({0, 1}) .ToFlatVector(), - ElementsAre(FloatEq(87.8693), FloatEq(67.538), FloatEq(64.31))); + AnyOf(ElementsAre(FloatEq(87.8693), FloatEq(67.538), + FloatEq(64.31)), // macOS + ElementsAre(FloatEq(87.8758), + FloatEq(67.5518), // Linux / Windows + FloatEq(64.3254))) + + ); } // namespace tests TEST_P(TriangleMeshPermuteDevices, ComputeTriangleAreas) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + t::geometry::TriangleMesh mesh_empty; EXPECT_NO_THROW(mesh_empty.ComputeTriangleAreas()); @@ -1424,6 +1442,8 @@ TEST_P(TriangleMeshPermuteDevices, ComputeTriangleAreas) { TEST_P(TriangleMeshPermuteDevices, RemoveNonManifoldEdges) { using ::testing::UnorderedElementsAreArray; core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + t::geometry::TriangleMesh mesh_empty(device); EXPECT_TRUE(mesh_empty.RemoveNonManifoldEdges().IsEmpty()); diff --git a/cpp/tests/t/io/ImageIO.cpp b/cpp/tests/t/io/ImageIO.cpp index db6be27041c..2c614481bd8 100644 --- a/cpp/tests/t/io/ImageIO.cpp +++ b/cpp/tests/t/io/ImageIO.cpp @@ -166,7 +166,7 @@ TEST(ImageIO, WriteImageToJPG) { EXPECT_TRUE(img.AsTensor().AllClose(read_img.AsTensor())); } -// JPG supports only UInt8, and PNG supports both UInt8 and UInt16. +// JPG supports only UInt8, and PNG supports Bool, UInt8 and UInt16. // All other data types are expected to fail. 
TEST(ImageIO, DifferentDtype) { const std::string tmp_path = utility::filesystem::GetTempDirectoryPath(); @@ -210,9 +210,8 @@ TEST(ImageIO, DifferentDtype) { EXPECT_FALSE( t::io::WriteImage(tmp_path + "/test_imageio_dtype.png", t::geometry::Image(100, 200, 3, core::Int64))); - EXPECT_FALSE( - t::io::WriteImage(tmp_path + "/test_imageio_dtype.png", - t::geometry::Image(100, 200, 3, core::Bool))); + EXPECT_TRUE(t::io::WriteImage(tmp_path + "/test_imageio_dtype.png", + t::geometry::Image(100, 200, 3, core::Bool))); } TEST(ImageIO, CornerCases) { diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 189fa6c50b0..c15dabae54a 100755 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -19,17 +19,17 @@ ARG PACKAGE ARG BUILD_SYCL_MODULE ARG CI -RUN if [ -z "${DEVELOPER_BUILD}" ]; then echo "Error: ARG DEVELOPER_BUILD not specified."; exit 1; fi \ - && if [ -z "${CCACHE_TAR_NAME}" ]; then echo "Error: ARG CCACHE_TAR_NAME not specified."; exit 1; fi \ - && if [ -z "${CMAKE_VERSION}" ]; then echo "Error: ARG CMAKE_VERSION not specified."; exit 1; fi \ - && if [ -z "${CCACHE_VERSION}" ]; then echo "Error: ARG CCACHE_VERSION not specified."; exit 1; fi \ - && if [ -z "${PYTHON_VERSION}" ]; then echo "Error: ARG PYTHON_VERSION not specified."; exit 1; fi \ - && if [ -z "${BUILD_SHARED_LIBS}" ]; then echo "Error: ARG BUILD_SHARED_LIBS not specified."; exit 1; fi \ - && if [ -z "${BUILD_CUDA_MODULE}" ]; then echo "Error: ARG BUILD_CUDA_MODULE not specified."; exit 1; fi \ - && if [ -z "${BUILD_TENSORFLOW_OPS}" ]; then echo "Error: ARG BUILD_TENSORFLOW_OPS not specified."; exit 1; fi \ - && if [ -z "${BUILD_PYTORCH_OPS}" ]; then echo "Error: ARG BUILD_PYTORCH_OPS not specified."; exit 1; fi \ - && if [ -z "${PACKAGE}" ]; then echo "Error: ARG PACKAGE not specified."; exit 1; fi \ - && if [ -z "${BUILD_SYCL_MODULE}" ]; then echo "Error: ARG BUILD_SYCL_MODULE not specified."; exit 1; fi +RUN if [[ -z "${DEVELOPER_BUILD}" ]]; then echo "Error: ARG DEVELOPER_BUILD not 
specified."; exit 1; fi \ + && if [[ -z "${CCACHE_TAR_NAME}" ]]; then echo "Error: ARG CCACHE_TAR_NAME not specified."; exit 1; fi \ + && if [[ -z "${CMAKE_VERSION}" ]]; then echo "Error: ARG CMAKE_VERSION not specified."; exit 1; fi \ + && if [[ -z "${CCACHE_VERSION}" ]]; then echo "Error: ARG CCACHE_VERSION not specified."; exit 1; fi \ + && if [[ -z "${PYTHON_VERSION}" ]]; then echo "Error: ARG PYTHON_VERSION not specified."; exit 1; fi \ + && if [[ -z "${BUILD_SHARED_LIBS}" ]]; then echo "Error: ARG BUILD_SHARED_LIBS not specified."; exit 1; fi \ + && if [[ -z "${BUILD_CUDA_MODULE}" ]]; then echo "Error: ARG BUILD_CUDA_MODULE not specified."; exit 1; fi \ + && if [[ -z "${BUILD_TENSORFLOW_OPS}" ]]; then echo "Error: ARG BUILD_TENSORFLOW_OPS not specified."; exit 1; fi \ + && if [[ -z "${BUILD_PYTORCH_OPS}" ]]; then echo "Error: ARG BUILD_PYTORCH_OPS not specified."; exit 1; fi \ + && if [[ -z "${PACKAGE}" ]]; then echo "Error: ARG PACKAGE not specified."; exit 1; fi \ + && if [[ -z "${BUILD_SYCL_MODULE}" ]]; then echo "Error: ARG BUILD_SYCL_MODULE not specified."; exit 1; fi # Fix Nvidia repo key rotation issue # https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771 @@ -87,7 +87,15 @@ RUN apt-get update && apt-get install -y \ libxmlsec1-dev \ libffi-dev \ liblzma-dev \ + && if [ "${BUILD_SYCL_MODULE}" = "ON" ]; then \ + add-apt-repository -y ppa:ubuntu-toolchain-r/test \ + && apt-get install -y g++-11 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 \ + && update-alternatives --set g++ /usr/bin/g++-11 \ + && c++ -v; \ + fi \ && rm -rf /var/lib/apt/lists/* +# OneDPL TBB backend requires libstdc++ >= v11. This makes the created wheel Ubuntu 22.04+ only.
# pyenv # The pyenv python paths are used during docker run, in this way docker run @@ -224,9 +232,9 @@ RUN \ && make VERBOSE=1 -j$(nproc) \ && make install-pip-package -j$(nproc) \ && make install -j$(nproc) \ - && if [ "${PACKAGE}" = "ON" ]; then make package; fi \ - && if [ "${PACKAGE}" = "VIEWER" ]; then make package-Open3DViewer-deb; fi \ - && if [ "${CI:-}a" != "a" ]; then rm -rf _deps assimp embree ipp mkl mkl_install webrtc; fi + && if [[ "${PACKAGE}" = "ON" ]]; then make package; fi \ + && if [[ "${PACKAGE}" = "VIEWER" ]]; then make package-Open3DViewer-deb; fi \ + && if [[ "${CI:-}a" != "a" ]]; then rm -rf _deps assimp embree ipp mkl mkl_install webrtc librealsense; fi # If CI is not null or unset, remove all large build folders to save disk space # Compress ccache folder, move to / directory @@ -236,8 +244,9 @@ RUN ccache -s \ && CCACHE_DIR_PARENT=$(dirname ${CCACHE_DIR}) \ && cd ${CCACHE_DIR_PARENT} \ && tar -caf /${CCACHE_TAR_NAME}.tar.xz ${CCACHE_DIR_NAME} \ - && if [ "${PACKAGE}" = "ON" ]; then mv /root/Open3D/build/package/open3d-devel*.tar.xz /; fi \ - && if [ "${PACKAGE}" = "VIEWER" ]; then mv /root/Open3D/build/package-Open3DViewer-deb/open3d-viewer-*-Linux.deb /; fi \ + && if [[ "${PACKAGE}" = "ON" ]]; then mv /root/Open3D/build/package/open3d-devel*.tar.xz /; fi \ + && if [[ "${PACKAGE}" = "VIEWER" ]]; then mv /root/Open3D/build/package-Open3DViewer-deb/open3d-viewer-*-Linux.deb /; fi \ + && if [[ "${BUILD_SYCL_MODULE}" = "ON" && "${BUILD_SHARED_LIBS}" = "ON" ]]; then mv /root/Open3D/build/lib/python_package/pip_package/open3d-*.whl /; fi \ && ls -alh / RUN echo "Docker build done." 
diff --git a/docker/docker_build.sh b/docker/docker_build.sh index e6c22bb10df..176657c795b 100755 --- a/docker/docker_build.sh +++ b/docker/docker_build.sh @@ -451,13 +451,17 @@ sycl-shared_export_env() { export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 export DEVELOPER_BUILD=ON export CCACHE_TAR_NAME=open3d-ci-sycl - export PYTHON_VERSION=3.8 + export PYTHON_VERSION=3.10 export BUILD_SHARED_LIBS=ON export BUILD_CUDA_MODULE=OFF - export BUILD_TENSORFLOW_OPS=OFF - export BUILD_PYTORCH_OPS=OFF - export PACKAGE=OFF + export BUILD_TENSORFLOW_OPS=ON + export BUILD_PYTORCH_OPS=ON + export PACKAGE=ON export BUILD_SYCL_MODULE=ON + + export IGC_EnableDPEmulation=1 # Enable float64 emulation during compilation + export SYCL_CACHE_PERSISTENT=1 # Cache SYCL kernel binaries. + export OverrideDefaultFP64Settings=1 # Enable double precision emulation at runtime. } sycl-static_export_env() { @@ -468,13 +472,17 @@ sycl-static_export_env() { export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 export DEVELOPER_BUILD=ON export CCACHE_TAR_NAME=open3d-ci-sycl - export PYTHON_VERSION=3.8 + export PYTHON_VERSION=3.10 export BUILD_SHARED_LIBS=OFF export BUILD_CUDA_MODULE=OFF export BUILD_TENSORFLOW_OPS=OFF export BUILD_PYTORCH_OPS=OFF export PACKAGE=OFF export BUILD_SYCL_MODULE=ON + + export IGC_EnableDPEmulation=1 # Enable float64 emulation during compilation + export SYCL_CACHE_PERSISTENT=1 # Cache SYCL kernel binaries. + export OverrideDefaultFP64Settings=1 # Enable double precision emulation at runtime. } function main() { diff --git a/docs/index.rst b/docs/index.rst index 09dda4f1dee..daa572d280d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ Open3D: A Modern Library for 3D Data Processing builddocs docker arm + sycl open3d_ml .. toctree:: diff --git a/docs/sycl.rst b/docs/sycl.rst new file mode 100644 index 00000000000..e17d085f79f --- /dev/null +++ b/docs/sycl.rst @@ -0,0 +1,126 @@ +.. 
_sycl: + +Cross-platform GPU support (SYCL) +================================= + +From v0.19, Open3D provides an experimental SYCL backend for cross-platform GPU +support. This backend allows Open3D operations to run on many different GPUs, +including integrated GPUs and discrete GPUs from Intel, Nvidia and AMD. We +provide pre-built C++ binaries and Python 3.10 wheels for Linux (Ubuntu 22.04+). + +Enabled features +----------------- + +Many Tensor API operations and Tensor Geometry operations without custom kernels +can now be offloaded to SYCL devices. In addition, HW accelerated raycasting +queries in :py:class:`open3d.t.geometry.RayCastingScene` are also supported. You +will get an error if an operation is not supported. The implementation is tested +on Linux on Intel integrated and discrete GPUs. Currently, a single GPU +(`SYCL:0`, if available) and the CPU (`SYCL:1` if a GPU is available, else +`SYCL:0`) are supported. + +Installation +------------- + +Both C++ binaries and Python wheels (Python 3.10 only for now) can be downloaded +from the Open3D GitHub releases page. For C++, install the `OneAPI runtime +`_ +and (optionally) SYCL runtime for your `Nvidia +`_ or `AMD +`_ GPU. + +For Python, the wheels will automatically install the DPC++ runtime package +(`dpcpp-cpp-rt`). You will also need `libomp5` installed: `apt-get install +libomp5-11`. Make sure to have the `correct drivers installed +`_ for your GPU. For +raycasting on Intel GPUs, you will also need the +`intel-level-zero-gpu-raytracing` package. + +Usage +------ + +The SYCL backend requires the new CXX11 ABI (Linux, gcc, libstdc++ only). If you +need to use the Open3D PyTorch extension, you should use cxx11_abi wheels for +PyTorch: + +.. code-block:: shell + + pip install torch==2.2.2+cpu.cxx11.abi -i https://download.pytorch.org/whl/cpu/ + +Some GPUs do not have native double precision support. For Intel GPUs, you can +emulate support with these environment variables: + +.. 
code-block:: shell + + export IGC_EnableDPEmulation=1 # Enable float64 emulation during compilation + export OverrideDefaultFP64Settings=1 # Enable double precision emulation at runtime. + +The binaries only contain kernels compiled to SPIR-V IR. At runtime, they will +be JIT compiled to your target GPU's native ISA. This means that the first run +of a kernel on a new device will be slower than subsequent runs. Use this +environment variable to cache the JIT compiled kernels to your home directory: + +.. code-block:: shell + + export SYCL_CACHE_PERSISTENT=1 # Cache SYCL kernel binaries. + +.. code-block:: python + + import open3d as o3d + o3d.core.sycl.enable_persistent_jit_cache() # Cache SYCL kernel binaries. + +For multi-GPU systems (e.g. with both integrated and discrete GPUs), the more +powerful GPU is automatically selected, as long as the correct GPU drivers and +SYCL runtime are installed. You can select a specific device with the +`ONEAPI_DEVICE_FILTER` or `SYCL_DEVICE_ALLOWLIST` `environment variables +`_. + + +.. code-block:: shell + + # Print all available devices (command line): + sycl-ls + # Examples: + export ONEAPI_DEVICE_SELECTOR="opencl:1" # Select the 2nd OpenCL device + + +.. code-block:: python + + # Print all available devices (Python): + import os + os.environ["SYCL_DEVICE_ALLOWLIST"] = "BackendName:cuda" # Select CUDA GPU + import open3d as o3d + o3d.core.sycl.print_sycl_devices(print_all=True) + + # Return a list of available devices. + o3d.core.sycl.get_available_device() + + # Check if a device is available + o3d.core.sycl.is_available(o3d.core.Device("SYCL:0")) + + +Building from source +--------------------- + +You can build the binaries from source as shown below. To build for a different +Python version, set the `PYTHON_VERSION` variable in `docker/docker_build.sh`. + +.. code-block:: shell + + cd docker + ./docker_build.sh sycl-shared + +This will create the Python wheel and C++ binary archive in the current +directory.
+ +You can directly compile for a specific target device (i.e. ahead of time or AOT +compilation) using the OPEN3D_SYCL_TARGETS (`-fsycl-targets` compiler option) and +OPEN3D_SYCL_TARGET_BACKEND_OPTIONS (`-Xs` compiler option) CMake variables in +Open3D. See the `compiler documentation +`_ for +information about building for specific hardware. + +If you want to use different settings (e.g. AOT compilation for a specific +device, or build a wheel for a different Python version), you can update the +``docker_build.sh`` script, or build directly on host after installing the +``intel-basekit`` or ``intel-cpp-essentials`` Debian packages from the Intel +OneAPI repository. \ No newline at end of file diff --git a/python/requirements.txt b/python/requirements.txt index a7da4963446..66db20516fb 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,4 +1,4 @@ -numpy>=1.18.0,<2.0.0 +numpy>=1.18.0 dash>=2.6.0 werkzeug>=3.0.0 flask>=3.0.0 diff --git a/python/requirements_sycl.txt b/python/requirements_sycl.txt new file mode 100644 index 00000000000..31f4c8a05cb --- /dev/null +++ b/python/requirements_sycl.txt @@ -0,0 +1 @@ +dpcpp-cpp-rt==2024.1.0 \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 341d91f361c..b2be5b93af7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -154,7 +154,8 @@ def finalize_options(self): long_description = readme.read() # open3d-cpu wheel for Linux x86_64 if sys.platform.startswith("linux") and platform.machine() in ( - 'i386', 'x86_64', 'AMD64') and "@BUILD_CUDA_MODULE@" == "OFF": + 'i386', 'x86_64', 'AMD64' +) and "@BUILD_CUDA_MODULE@" == "OFF" and "@BUILD_SYCL_MODULE@" == "OFF": name += "-cpu" long_description += ("\n\nThis wheel only contains CPU functionality. 
" "Use the open3d wheel for full functionality.") diff --git a/python/test/core/test_core.py b/python/test/core/test_core.py index c94a961e1fe..2f344381a57 100644 --- a/python/test/core/test_core.py +++ b/python/test/core/test_core.py @@ -68,7 +68,7 @@ def to_numpy_dtype(dtype: o3c.Dtype): @pytest.mark.parametrize("dtype", list_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_creation(dtype, device): # Shape takes tuple, list or o3c.SizeVector t = o3c.Tensor.empty((2, 3), dtype, device=device) @@ -95,7 +95,7 @@ def test_creation(dtype, device): @pytest.mark.parametrize("shape", [(), (0,), (1,), (0, 2), (0, 0, 2), (2, 0, 3)]) @pytest.mark.parametrize("dtype", list_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_creation_special_shapes(shape, dtype, device): o3_t = o3c.Tensor.full(shape, 3.14, dtype, device=device) np_t = np.full(shape, 3.14, dtype=to_numpy_dtype(dtype)) @@ -128,7 +128,7 @@ def test_device(): @pytest.mark.parametrize("dtype", list_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensor_constructor(dtype, device): # Numpy array np_t = np.array([[0, 1, 2], [3, 4, 5]], dtype=to_numpy_dtype(dtype)) @@ -177,7 +177,7 @@ def test_tensor_constructor(dtype, device): np.testing.assert_equal(np_t, o3_t.cpu().numpy()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_arange(device): # Full parameters. 
setups = [(0, 10, 1), (0, 10, 1), (0.0, 10.0, 2.0), (0.0, -10.0, -2.0)] @@ -225,7 +225,7 @@ def test_arange(device): np.testing.assert_equal(np_t, o3_t.cpu().numpy()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_flatten(device): # Flatten 0-D tensor @@ -370,7 +370,7 @@ def test_flatten(device): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_append(dtype, device): # Appending 0-D. # 0-D can only be appended along axis = null. @@ -585,7 +585,7 @@ def get_dst_t(): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_binary_ew_ops(dtype, device): a = o3c.Tensor(np.array([4, 6, 8, 10, 12, 14]), dtype=dtype, device=device) b = o3c.Tensor(np.array([2, 3, 4, 5, 6, 7]), dtype=dtype, device=device) @@ -613,7 +613,7 @@ def test_binary_ew_ops(dtype, device): np.testing.assert_equal(a.cpu().numpy(), np.array([2, 2, 2, 2, 2, 2])) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_to(device): a = o3c.Tensor(np.array([0.1, 1.2, 2.3, 3.4, 4.5, 5.6]).astype(np.float32), device=device) @@ -625,7 +625,7 @@ def test_to(device): assert b.device == a.device -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_unary_ew_ops(device): src_vals = np.array([0, 1, 2, 3, 4, 5]).astype(np.float32) src = o3c.Tensor(src_vals, device=device) @@ -658,7 +658,7 @@ def test_unary_ew_ops(device): atol=atol) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_getitem(device): np_t = np.array(range(24)).reshape((2, 3, 4)) o3_t = 
o3c.Tensor(np_t, device=device) @@ -700,7 +700,7 @@ def test_getitem(device): o3c.Tensor.ones((), device=device)[0:1] -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_setitem(device): np_ref = np.array(range(24)).reshape((2, 3, 4)) @@ -821,7 +821,7 @@ def test_setitem(device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_sum(dim, keepdim, device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = o3c.Tensor(np_src, device=device) @@ -840,7 +840,7 @@ def test_reduction_sum(dim, keepdim, device): ((0, 2), (1)), ]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_special_shapes(shape_and_axis, keepdim, device): shape, axis = shape_and_axis np_src = np.array(np.random.rand(*shape)) @@ -856,7 +856,7 @@ def test_reduction_special_shapes(shape_and_axis, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_mean(dim, keepdim, device): np_src = np.array(range(24)).reshape((2, 3, 4)).astype(np.float32) o3_src = o3c.Tensor(np_src, device=device) @@ -870,7 +870,7 @@ def test_reduction_mean(dim, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_prod(dim, keepdim, 
device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = o3c.Tensor(np_src, device=device) @@ -884,7 +884,7 @@ def test_reduction_prod(dim, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_min(dim, keepdim, device): np_src = np.array(range(24)) np.random.shuffle(np_src) @@ -900,7 +900,7 @@ def test_reduction_min(dim, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_max(dim, keepdim, device): np_src = np.array(range(24)) np.random.shuffle(np_src) @@ -913,7 +913,7 @@ def test_reduction_max(dim, keepdim, device): @pytest.mark.parametrize("dim", [0, 1, 2, None]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_argmin_argmax(dim, device): np_src = np.array(range(24)) np.random.shuffle(np_src) @@ -929,7 +929,7 @@ def test_reduction_argmin_argmax(dim, device): np.testing.assert_allclose(o3_dst.cpu().numpy(), np_dst) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_advanced_index_get_mixed(device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = o3c.Tensor(np_src, device=device) @@ -955,7 +955,7 @@ def test_advanced_index_get_mixed(device): np.testing.assert_equal(o3_dst.cpu().numpy(), np_dst) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_advanced_index_set_mixed(device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = 
o3c.Tensor(np_src, device=device) @@ -988,7 +988,7 @@ def test_advanced_index_set_mixed(device): ("ceil", "ceil"), ("round", "round"), ("trunc", "trunc")]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_unary_elementwise(np_func_name, o3_func_name, device): np_t = np.array([-3.4, -2.6, -1.5, 0, 1.4, 2.6, 3.5]).astype(np.float32) o3_t = o3c.Tensor(np_t, device=device) @@ -1010,7 +1010,7 @@ def test_unary_elementwise(np_func_name, o3_func_name, device): atol=1e-7) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_logical_ops(device): np_a = np.array([True, False, True, False]) np_b = np.array([True, True, False, False]) @@ -1030,7 +1030,7 @@ def test_logical_ops(device): np.testing.assert_equal(o3_r.cpu().numpy(), np_r) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_comparision_ops(device): np_a = np.array([0, 1, -1]) np_b = np.array([0, 0, 0]) @@ -1045,7 +1045,7 @@ def test_comparision_ops(device): np.testing.assert_equal((o3_a != o3_b).cpu().numpy(), np_a != np_b) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_non_zero(device): np_x = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]]) np_nonzero_tuple = np.nonzero(np_x) @@ -1055,7 +1055,7 @@ def test_non_zero(device): np.testing.assert_equal(np_t, o3_t.cpu().numpy()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_boolean_advanced_indexing(device): np_a = np.array([1, -1, -2, 3]) o3_a = o3c.Tensor(np_a, device=device) @@ -1140,7 +1140,7 @@ def test_boolean_advanced_indexing(device): device=device) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def 
test_scalar_op(device): # + a = o3c.Tensor.ones((2, 3), o3c.float32, device=device) @@ -1390,7 +1390,7 @@ def test_scalar_op(device): np.testing.assert_equal(a.cpu().numpy(), np.array([1, 0, 1])) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_all_any(device): a = o3c.Tensor([False, True, True, True], dtype=o3c.bool, device=device) assert not a.all() @@ -1405,7 +1405,7 @@ def test_all_any(device): assert not a.any() -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_allclose_isclose(device): a = o3c.Tensor([1, 2], device=device) b = o3c.Tensor([1, 3], device=device) @@ -1430,7 +1430,7 @@ def test_allclose_isclose(device): assert not a.allclose(b) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_issame(device): dtype = o3c.float32 a = o3c.Tensor.ones((2, 3), dtype, device=device) @@ -1448,7 +1448,7 @@ def test_issame(device): assert d.issame(e) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_item(device): o3_t = o3c.Tensor.ones((2, 3), dtype=o3c.float32, device=device) * 1.5 assert o3_t[0, 0].item() == 1.5 @@ -1471,7 +1471,7 @@ def test_item(device): assert isinstance(o3_t[0, 0].item(), bool) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_save_load(device): with tempfile.TemporaryDirectory() as temp_dir: file_name = f"{temp_dir}/tensor.npy" @@ -1535,7 +1535,7 @@ def test_save_load(device): np.testing.assert_equal(o3_t_load.cpu().numpy(), np_t) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_iterator(device): # 0-d. 
o3_t = o3c.Tensor.ones((), dtype=o3c.float32, device=device) @@ -1572,7 +1572,7 @@ def test_iterator(device): np.array([[0, 10, 20], [30, 40, 50]])) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): o3_t = o3c.Tensor.ones((100), dtype=o3c.float32, device=device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/core/test_linalg.py b/python/test/core/test_linalg.py index 7d1580570f8..7ab8908394f 100644 --- a/python/test/core/test_linalg.py +++ b/python/test/core/test_linalg.py @@ -17,7 +17,7 @@ from open3d_test import list_devices -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_matmul(device, dtype): @@ -71,7 +71,7 @@ def test_matmul(device, dtype): assert 'dimensions with zero' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_addmm(device, dtype): # Shape takes tuple, list or o3c.SizeVector @@ -141,7 +141,7 @@ def test_addmm(device, dtype): assert 'dimensions with zero' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_det(device, dtype): @@ -172,7 +172,7 @@ def test_det(device, dtype): assert 'must be square' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_lu(device, dtype): @@ -203,7 +203,7 @@ def test_lu(device, dtype): assert 'must be 2D' in 
str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_lu_ipiv(device, dtype): @@ -237,7 +237,7 @@ def test_lu_ipiv(device, dtype): assert 'must be 2D' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_inverse(device, dtype): @@ -289,7 +289,7 @@ def test_inverse(device, dtype): assert 'Singular matrix' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_svd(device, dtype): @@ -351,7 +351,7 @@ def test_svd(device, dtype): assert 'must be 2D' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_solve(device, dtype): # Test square @@ -369,7 +369,7 @@ def test_solve(device, dtype): assert 'singular' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_lstsq(device, dtype): # Test square @@ -416,7 +416,7 @@ def test_lstsq(device, dtype): a_shape[0], a_shape[1]) in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_thiu(device, dtype): @@ -440,7 +440,7 @@ def test_thiu(device, dtype): atol=1e-5) -@pytest.mark.parametrize("device", 
list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_thil(device, dtype): @@ -464,7 +464,7 @@ def test_thil(device, dtype): atol=1e-5) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_thiul(device, dtype): diff --git a/python/test/core/test_size_vector.py b/python/test/core/test_size_vector.py index 57ab9f8ab88..b4e80a7a1b5 100644 --- a/python/test/core/test_size_vector.py +++ b/python/test/core/test_size_vector.py @@ -67,7 +67,7 @@ def test_size_vector(): sv = o3d.core.SizeVector(["foo", "bar"]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_implicit_conversion(device): # Reshape t = o3d.core.Tensor.ones((3, 4), device=device) diff --git a/python/test/core/test_tensor_function.py b/python/test/core/test_tensor_function.py index 4a45b06f599..7f4390e8706 100644 --- a/python/test/core/test_tensor_function.py +++ b/python/test/core/test_tensor_function.py @@ -49,7 +49,7 @@ def list_non_bool_dtypes(): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_concatenate(dtype, device): # 0-D cannot be concatenated. @@ -200,7 +200,7 @@ def test_concatenate(dtype, device): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_append(dtype, device): # Appending 0-D. # 0-D can only be appended along axis = null. 
diff --git a/python/test/ml_ops/test_voxel_pooling.py b/python/test/ml_ops/test_voxel_pooling.py index a5198cdfc42..7fb85366571 100644 --- a/python/test/ml_ops/test_voxel_pooling.py +++ b/python/test/ml_ops/test_voxel_pooling.py @@ -80,7 +80,9 @@ def test_voxel_pooling(ml, pos_dtype, feat_dtype, position_fn, feature_fn): else: index = [1, 0] - np.testing.assert_allclose(ans.pooled_positions, expected_positions[index]) + np.testing.assert_allclose(ans.pooled_positions, + expected_positions[index], + rtol=1.1e-7) if feature_fn == 'average': if np.issubdtype(feat_dtype, np.integer): @@ -99,7 +101,9 @@ def test_voxel_pooling(ml, pos_dtype, feat_dtype, position_fn, feature_fn): elif feature_fn == 'nearest_neighbor': expected_features = np.array([features[0], features[3]]) - np.testing.assert_allclose(ans.pooled_features, expected_features[index]) + np.testing.assert_allclose(ans.pooled_features, + expected_features[index], + rtol=1.1e-7) @mltest.parametrize.ml_cpu_only diff --git a/python/test/t/geometry/test_lineset.py b/python/test/t/geometry/test_lineset.py index 2d3daceccaa..143c1a02159 100644 --- a/python/test/t/geometry/test_lineset.py +++ b/python/test/t/geometry/test_lineset.py @@ -35,7 +35,7 @@ def test_extrude_linear(): assert ans.triangle.indices.shape == (4, 3) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): line = o3d.t.geometry.LineSet([[0.7, 0, 0], [1, 0, 0]], [[0, 1]]).to(device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/t/geometry/test_pointcloud.py b/python/test/t/geometry/test_pointcloud.py index dc98b1db951..74f5d5f27c0 100644 --- a/python/test/t/geometry/test_pointcloud.py +++ b/python/test/t/geometry/test_pointcloud.py @@ -19,7 +19,7 @@ from open3d_test import list_devices -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def 
test_constructor_and_accessors(device): dtype = o3c.float32 @@ -46,7 +46,7 @@ def test_constructor_and_accessors(device): assert pcd.point.positions.allclose(o3c.Tensor([[1, 2, 3]], dtype, device)) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_from_legacy(device): dtype = o3c.float32 @@ -63,7 +63,7 @@ def test_from_legacy(device): o3c.Tensor([[6, 7, 8], [9, 10, 11]], dtype, device)) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_to_legacy(device): dtype = o3c.float32 @@ -180,7 +180,7 @@ def test_extrude_linear(): assert ans.line.indices.shape == (1, 2) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): pcd = o3d.t.geometry.PointCloud(device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/t/geometry/test_tensormap.py b/python/test/t/geometry/test_tensormap.py index 255c1ef55b2..18a9b70aa07 100644 --- a/python/test/t/geometry/test_tensormap.py +++ b/python/test/t/geometry/test_tensormap.py @@ -22,7 +22,7 @@ class WrongType(): pass -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensormap(device): dtype = o3c.float32 @@ -103,7 +103,7 @@ def test_tensormap(device): assert primary_key == "positions" -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensormap_modify(device): # Assigning to the *elements* of an alias will change the value in the map. # This tests that the alias shares the same memory as the tensor in the map. 
@@ -170,7 +170,7 @@ def test_tensormap_modify(device): np.testing.assert_equal(tm.b.cpu().numpy(), [100]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensor_dict_modify(device): """ Same as test_tensormap_modify(), but we put Tensors in a python dict. @@ -301,7 +301,7 @@ def test_numpy_dict_modify(): np.testing.assert_equal(tm["b"], [100]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): tm = o3d.t.geometry.TensorMap("positions") with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/t/geometry/test_trianglemesh.py b/python/test/t/geometry/test_trianglemesh.py index 88678e3f877..c74aeb67d26 100644 --- a/python/test/t/geometry/test_trianglemesh.py +++ b/python/test/t/geometry/test_trianglemesh.py @@ -34,7 +34,7 @@ def test_slice_plane(): assert slices.line.indices.shape == (9, 2) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_box(device): # Test with default parameters. box_default = o3d.t.geometry.TriangleMesh.create_box(device=device) @@ -118,7 +118,7 @@ def test_create_box(device): assert box_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_sphere(device): # Test with custom parameters. sphere_custom = o3d.t.geometry.TriangleMesh.create_sphere( @@ -180,7 +180,7 @@ def test_create_sphere(device): assert sphere_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_tetrahedron(device): # Test with custom parameters. 
tetrahedron_custom = o3d.t.geometry.TriangleMesh.create_tetrahedron( @@ -204,7 +204,7 @@ def test_create_tetrahedron(device): assert tetrahedron_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_octahedron(device): # Test with custom parameters. octahedron_custom = o3d.t.geometry.TriangleMesh.create_octahedron( @@ -242,7 +242,7 @@ def test_create_octahedron(device): assert octahedron_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_icosahedron(device): # Test with custom parameters. icosahedron_custom = o3d.t.geometry.TriangleMesh.create_icosahedron( @@ -298,7 +298,7 @@ def test_create_icosahedron(device): assert icosahedron_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_cylinder(device): # Test with custom parameters. cylinder_custom = o3d.t.geometry.TriangleMesh.create_cylinder( @@ -360,7 +360,7 @@ def test_create_cylinder(device): assert cylinder_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_cone(device): # Test with custom parameters. cone_custom = o3d.t.geometry.TriangleMesh.create_cone( @@ -404,7 +404,7 @@ def test_create_cone(device): assert cone_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_torus(device): # Test with custom parameters. 
torus_custom = o3d.t.geometry.TriangleMesh.create_torus( @@ -482,7 +482,7 @@ def test_create_torus(device): assert torus_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_arrow(device): # Test with custom parameters. arrow_custom = o3d.t.geometry.TriangleMesh.create_arrow( @@ -546,7 +546,7 @@ def test_create_arrow(device): assert arrow_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_mobius(device): # Test with custom parameters. mobius_custom = o3d.t.geometry.TriangleMesh.create_mobius( @@ -818,7 +818,7 @@ def test_extrude_linear(): assert ans.triangle.indices.shape == (8, 3) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): mesh = o3d.t.geometry.TriangleMesh.create_box().to(device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/util/ci_utils.sh b/util/ci_utils.sh index ce921111092..b6110110d7b 100644 --- a/util/ci_utils.sh +++ b/util/ci_utils.sh @@ -21,6 +21,7 @@ fi BUILD_TENSORFLOW_OPS=${BUILD_TENSORFLOW_OPS:-ON} BUILD_PYTORCH_OPS=${BUILD_PYTORCH_OPS:-ON} LOW_MEM_USAGE=${LOW_MEM_USAGE:-OFF} +BUILD_SYCL_MODULE=${BUILD_SYCL_MODULE:-OFF} # Dependency versions: # CUDA: see docker/docker_build.sh @@ -28,6 +29,7 @@ LOW_MEM_USAGE=${LOW_MEM_USAGE:-OFF} TENSORFLOW_VER="2.16.2" TORCH_VER="2.2.2" TORCH_REPO_URL="https://download.pytorch.org/whl/torch/" +TORCH_CXX11_URL="https://download.pytorch.org/whl/" # Python PIP_VER="24.3.1" PROTOBUF_VER="4.24.0" @@ -73,9 +75,12 @@ install_python_dependencies() { python -m pip install -U "$TF_ARCH_NAME"=="$TENSORFLOW_VER" # ML/requirements-tensorflow.txt fi if [ "$BUILD_PYTORCH_OPS" == "ON" ]; then # ML/requirements-torch.txt - if [[ "$OSTYPE" 
== "linux-gnu"* ]]; then - python -m pip install -U "${TORCH_GLNX}" -f "$TORCH_REPO_URL" tensorboard - + if [[ "$OSTYPE" == "linux-gnu"* && "$BUILD_SYCL_MODULE" == "OFF" ]]; then + python -m pip install -U "${TORCH_GLNX}" -f "$TORCH_REPO_URL" + python -m pip install tensorboard + elif [[ "$OSTYPE" == "linux-gnu"* && "$BUILD_SYCL_MODULE" == "ON" ]]; then + python -m pip install -U "${TORCH_GLNX}.cxx11.abi" -i "$TORCH_CXX11_URL" + python -m pip install tensorboard elif [[ "$OSTYPE" == "darwin"* ]]; then python -m pip install -U torch=="$TORCH_VER" -f "$TORCH_REPO_URL" tensorboard else