diff --git a/.github/workflows/ubuntu-sycl.yml b/.github/workflows/ubuntu-sycl.yml index daad144ce08..bce4e90f4d7 100644 --- a/.github/workflows/ubuntu-sycl.yml +++ b/.github/workflows/ubuntu-sycl.yml @@ -49,6 +49,26 @@ jobs: docker/docker_test.sh sycl-static fi + - name: Upload Python wheel and C++ binary package to GitHub artifacts + if: ${{ matrix.BUILD_SHARED_LIBS == 'ON' }} + uses: actions/upload-artifact@v4 + with: + name: open3d-sycl-linux-wheel-and-binary + path: | + open3d-*.whl + open3d-*.tar.xz + if-no-files-found: error + - name: Update devel release + if: ${{ github.ref == 'refs/heads/main' && matrix.BUILD_SHARED_LIBS == 'ON' }} + env: + GH_TOKEN: ${{ github.token }} + run: | + if [ ${{ matrix.BUILD_SHARED_LIBS }} == 'ON' ] ; then + gh release upload main-devel open3d-*.whl --clobber + gh release upload main-devel open3d-*.tar.xz --clobber + fi + gh release view main-devel + - name: GCloud CLI auth if: ${{ github.ref == 'refs/heads/main' }} uses: 'google-github-actions/auth@v2' diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake index 9beb7da33c7..a782265a553 100644 --- a/3rdparty/find_dependencies.cmake +++ b/3rdparty/find_dependencies.cmake @@ -1534,12 +1534,17 @@ open3d_import_3rdparty_library(3rdparty_uvatlas list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_uvatlas) +# SYCL link options are specified here. 
Compile options are only applied to SYCL source files and are specified in cmake/Open3DSYCLTargetSources.cmake if(BUILD_SYCL_MODULE) add_library(3rdparty_sycl INTERFACE) target_link_libraries(3rdparty_sycl INTERFACE $<$,$>>:sycl>) target_link_options(3rdparty_sycl INTERFACE - $<$,$>>:-fsycl -fsycl-targets=spir64_x86_64>) + $<$,$>>:-fsycl -fsycl-targets=${OPEN3D_SYCL_TARGETS}>) + if (OPEN3D_SYCL_TARGET_BACKEND_OPTIONS) + target_link_options(3rdparty_sycl INTERFACE + $<$,$>>:-Xs ${OPEN3D_SYCL_TARGET_BACKEND_OPTIONS}>) + endif() if(NOT BUILD_SHARED_LIBS OR arg_PUBLIC) install(TARGETS 3rdparty_sycl EXPORT Open3DTargets) endif() @@ -1565,8 +1570,12 @@ if(OPEN3D_USE_ONEAPI_PACKAGES) GROUPED INCLUDE_DIRS ${MKL_INCLUDE}/ LIB_DIR ${MKL_ROOT}/lib/intel64 - LIBRARIES mkl_intel_ilp64 mkl_tbb_thread mkl_core + LIBRARIES $<$:mkl_sycl> mkl_intel_ilp64 mkl_tbb_thread mkl_core ) + if (BUILD_SYCL_MODULE) + # target_link_options(3rdparty_mkl INTERFACE "-Wl,-export-dynamic") + target_link_libraries(3rdparty_mkl INTERFACE OpenCL) + endif() # MKL definitions target_compile_options(3rdparty_mkl INTERFACE "$<$:$<$:-m64>>") target_compile_definitions(3rdparty_mkl INTERFACE "$<$:MKL_ILP64>") diff --git a/CMakeLists.txt b/CMakeLists.txt index e1bd4706288..354125dc01d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,16 @@ else() option(STATIC_WINDOWS_RUNTIME "Use static (MT/MTd) Windows runtime" ON ) endif() option(BUILD_SYCL_MODULE "Build SYCL module with Intel oneAPI" OFF) +if(BUILD_SYCL_MODULE) + set(OPEN3D_SYCL_TARGETS "spir64" CACHE STRING + "SYCL targets: spir64 for JIT, or another for AOT compilation. See https://github.com/intel/llvm/blob/sycl/sycl/doc/UsersManual.md." +) + set(OPEN3D_SYCL_TARGET_BACKEND_OPTIONS "" CACHE STRING + "SYCL target backend options, e.g. to compile for a specific device. See https://github.com/intel/llvm/blob/sycl/sycl/doc/UsersManual.md." 
+) + set(BUILD_ISPC_MODULE OFF CACHE BOOL "Build the ISPC module" FORCE) + set(BUILD_CUDA_MODULE OFF CACHE BOOL "Build the CUDA module" FORCE) +endif() option(GLIBCXX_USE_CXX11_ABI "Set -D_GLIBCXX_USE_CXX11_ABI=1" ON ) option(ENABLE_SYCL_UNIFIED_SHARED_MEMORY "Enable SYCL unified shared memory" OFF) if(BUILD_GUI AND (WIN32 OR UNIX AND NOT LINUX_AARCH64 AND NOT APPLE_AARCH64)) @@ -282,12 +292,6 @@ endif() if(BUILD_SYCL_MODULE AND NOT GLIBCXX_USE_CXX11_ABI) message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires GLIBCXX_USE_CXX11_ABI=ON") endif() -if(BUILD_SYCL_MODULE AND BUILD_TENSORFLOW_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_TENSORFLOW_OPS=OFF") -endif() -if(BUILD_SYCL_MODULE AND BUILD_PYTORCH_OPS) - message(FATAL_ERROR "BUILD_SYCL_MODULE=ON requires BUILD_PYTORCH_OPS=OFF") -endif() if(BUILD_SYCL_MODULE AND BUILD_CUDA_MODULE) message(FATAL_ERROR "BUILD_SYCL_MODULE and BUILD_SYCL_MODULE cannot be on at the same time for now.") endif() diff --git a/cmake/Open3DSYCLTargetSources.cmake b/cmake/Open3DSYCLTargetSources.cmake index 9b0220c10ab..3314cb24303 100644 --- a/cmake/Open3DSYCLTargetSources.cmake +++ b/cmake/Open3DSYCLTargetSources.cmake @@ -2,7 +2,8 @@ # # When BUILD_SYCL_MODULE=ON, set SYCL-specific compile flags for the listed # source files and call target_sources(). If BUILD_SYCL_MODULE=OFF, this -# function directly calls target_sources(). +# function directly calls target_sources(). For SYCL link options, see +# 3rdparty/find_dependencies.cmake # # Note: this is not a perfect forwarding to target_sources(), as it only support # limited set of arguments. See the example usage below. 
@@ -31,7 +32,7 @@ function(open3d_sycl_target_sources target) if(BUILD_SYCL_MODULE) foreach(sycl_file IN LISTS arg_UNPARSED_ARGUMENTS) set_source_files_properties(${sycl_file} PROPERTIES - COMPILE_OPTIONS -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_x86_64) + COMPILE_OPTIONS "-fsycl;-fsycl-targets=${OPEN3D_SYCL_TARGETS}") if(arg_VERBOSE) message(STATUS "open3d_sycl_target_sources(${target}): marked ${sycl_file} as SYCL code") endif() diff --git a/cmake/Open3DSetGlobalProperties.cmake b/cmake/Open3DSetGlobalProperties.cmake index 03e43212a8b..ce8ff8d217e 100644 --- a/cmake/Open3DSetGlobalProperties.cmake +++ b/cmake/Open3DSetGlobalProperties.cmake @@ -25,15 +25,16 @@ function(open3d_enable_strip target) endif() endfunction() -# RPATH handling (for TBB DSO). Check current folder, one folder above and the lib sibling folder +# RPATH handling (for TBB DSO). Check current folder, one folder above and the lib sibling folder. +# Also check the Python virtual env /lib folder for 3rd party dependency libraries installed with `pip install` set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) if (APPLE) -# Add options to cover the various ways in which open3d shaed lib or apps can be installed wrt TBB DSO - set(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/../;@loader_path/../lib/") +# Add options to cover the various ways in which open3d shared lib or apps can be installed wrt TBB DSO + set(CMAKE_INSTALL_RPATH "@loader_path;@loader_path/../;@loader_path/../lib/;@loader_path/../../../../") # pybind with open3d shared lib is copied, not cmake-installed, so we need to add .. 
to build rpath set(CMAKE_BUILD_RPATH "@loader_path/../") elseif(UNIX) - set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../;$ORIGIN/../lib/") + set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../;$ORIGIN/../lib/;$ORIGIN/../../../../") set(CMAKE_BUILD_RPATH "$ORIGIN/../") endif() diff --git a/cpp/apps/CMakeLists.txt b/cpp/apps/CMakeLists.txt index a0efa960d3c..f0349b60961 100644 --- a/cpp/apps/CMakeLists.txt +++ b/cpp/apps/CMakeLists.txt @@ -130,6 +130,11 @@ macro(open3d_add_app_common SRC_DIR APP_NAME TARGET_NAME) target_link_libraries(${TARGET_NAME} PRIVATE Open3D::Open3D TBB::tbb ${ARGN}) set_target_properties(${TARGET_NAME} PROPERTIES FOLDER "apps") + if (BUILD_SYCL_MODULE) + find_package(IntelSYCL REQUIRED) # requires cmake>=3.25 on Windows + add_sycl_to_target(TARGET ${TARGET_NAME}) + endif() + open3d_link_3rdparty_libraries(${TARGET_NAME}) open3d_show_and_abort_on_warning(${TARGET_NAME}) open3d_set_global_properties(${TARGET_NAME}) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 89178ce97cd..997b3dc9ab5 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -14,6 +14,10 @@ if (BUILD_CUDA_MODULE) find_package(CUDAToolkit REQUIRED) target_link_libraries(benchmarks PRIVATE CUDA::cudart) endif() +if (BUILD_SYCL_MODULE) + find_package(IntelSYCL REQUIRED) # requires cmake>=3.25 on Windows + add_sycl_to_target(TARGET benchmarks) +endif() open3d_show_and_abort_on_warning(benchmarks) open3d_set_global_properties(benchmarks) diff --git a/cpp/open3d/core/CMakeLists.txt b/cpp/open3d/core/CMakeLists.txt index c34cea3fc6d..210cc33058a 100644 --- a/cpp/open3d/core/CMakeLists.txt +++ b/cpp/open3d/core/CMakeLists.txt @@ -5,7 +5,6 @@ target_sources(core PRIVATE CUDAUtils.cpp Device.cpp Dtype.cpp - EigenConverter.cpp Indexer.cpp MemoryManager.cpp MemoryManagerCached.cpp @@ -23,6 +22,7 @@ target_sources(core PRIVATE # Compile regardless BUILD_SYCL_MODULE == ON or OFF. 
open3d_sycl_target_sources(core PRIVATE + EigenConverter.cpp SYCLUtils.cpp ) @@ -43,22 +43,14 @@ target_sources(core PRIVATE hashmap/HashSet.cpp kernel/Kernel.cpp linalg/AddMM.cpp - linalg/AddMMCPU.cpp linalg/Det.cpp linalg/Inverse.cpp - linalg/InverseCPU.cpp linalg/LeastSquares.cpp - linalg/LeastSquaresCPU.cpp linalg/LU.cpp - linalg/LUCPU.cpp linalg/Matmul.cpp - linalg/MatmulCPU.cpp linalg/Solve.cpp - linalg/SolveCPU.cpp linalg/SVD.cpp - linalg/SVDCPU.cpp linalg/Tri.cpp - linalg/TriCPU.cpp nns/FixedRadiusIndex.cpp nns/FixedRadiusSearchOps.cpp nns/KnnIndex.cpp @@ -73,21 +65,48 @@ set_target_properties(core_impl PROPERTIES CXX_VISIBILITY_PRESET "hidden") target_sources(core_impl PRIVATE kernel/Arange.cpp - kernel/ArangeCPU.cpp kernel/BinaryEW.cpp - kernel/BinaryEWCPU.cpp kernel/IndexGetSet.cpp - kernel/IndexGetSetCPU.cpp kernel/IndexReduction.cpp - kernel/IndexReductionCPU.cpp kernel/NonZero.cpp - kernel/NonZeroCPU.cpp kernel/Reduction.cpp - kernel/ReductionCPU.cpp kernel/UnaryEW.cpp + kernel/ArangeCPU.cpp + kernel/BinaryEWCPU.cpp + kernel/IndexGetSetCPU.cpp + kernel/IndexReductionCPU.cpp + kernel/NonZeroCPU.cpp + kernel/ReductionCPU.cpp kernel/UnaryEWCPU.cpp + linalg/AddMMCPU.cpp + linalg/InverseCPU.cpp + linalg/LeastSquaresCPU.cpp + linalg/LUCPU.cpp + linalg/MatmulCPU.cpp + linalg/SolveCPU.cpp + linalg/SVDCPU.cpp + linalg/TriCPU.cpp +) + +if (BUILD_SYCL_MODULE) +open3d_sycl_target_sources(core_impl PRIVATE kernel/UnaryEWSYCL.cpp + kernel/BinaryEWSYCL.cpp + kernel/ArangeSYCL.cpp + kernel/IndexGetSetSYCL.cpp + kernel/NonZeroSYCL.cpp + kernel/IndexReductionSYCL.cpp + kernel/ReductionSYCL.cpp + linalg/AddMMSYCL.cpp + linalg/InverseSYCL.cpp + linalg/LeastSquaresSYCL.cpp + linalg/LUSYCL.cpp + linalg/MatmulSYCL.cpp + linalg/SolveSYCL.cpp + linalg/SVDSYCL.cpp + linalg/TriSYCL.cpp ) +endif() if (BUILD_CUDA_MODULE) target_sources(core PRIVATE diff --git a/cpp/open3d/core/Device.h b/cpp/open3d/core/Device.h index a79d9cf646c..d215d16e55e 100644 --- a/cpp/open3d/core/Device.h 
+++ b/cpp/open3d/core/Device.h @@ -115,4 +115,12 @@ struct hash { return std::hash{}(device.ToString()); } }; + +template <> +struct less { + bool operator()(const open3d::core::Device& lhs, + const open3d::core::Device& rhs) const { + return lhs.ToString() < rhs.ToString(); + } +}; } // namespace std diff --git a/cpp/open3d/core/Indexer.h b/cpp/open3d/core/Indexer.h index c9eb8933d49..298997117b5 100644 --- a/cpp/open3d/core/Indexer.h +++ b/cpp/open3d/core/Indexer.h @@ -34,11 +34,11 @@ class Indexer; class IndexerIterator; // Maximum number of dimensions of TensorRef. -static constexpr int64_t MAX_DIMS = 10; +static constexpr int64_t MAX_DIMS = 5; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -static constexpr int64_t MAX_INPUTS = 10; +static constexpr int64_t MAX_INPUTS = 5; // Maximum number of outputs of an op. This number can be increased when // necessary. @@ -110,7 +110,7 @@ struct TensorRef { TensorRef(const Tensor& t) { if (t.NumDims() > MAX_DIMS) { - utility::LogError("Tenor has too many dimensions {} > {}.", + utility::LogError("Tensor has too many dimensions {} > {}.", t.NumDims(), MAX_DIMS); } data_ptr_ = const_cast(t.GetDataPtr()); diff --git a/cpp/open3d/core/Indexer.isph b/cpp/open3d/core/Indexer.isph index c9b63486377..af7629d3f06 100644 --- a/cpp/open3d/core/Indexer.isph +++ b/cpp/open3d/core/Indexer.isph @@ -11,11 +11,11 @@ #include "open3d/utility/Helper.isph" // Maximum number of dimensions of TensorRef. -enum { MAX_DIMS = 10 }; +enum { MAX_DIMS = 4 }; // Maximum number of inputs of an op. // MAX_INPUTS shall be >= MAX_DIMS to support advanced indexing. -enum { MAX_INPUTS = 10 }; +enum { MAX_INPUTS = 4 }; // Maximum number of outputs of an op. This number can be increased when // necessary. 
diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h index 2d5bef78812..9e917789947 100644 --- a/cpp/open3d/core/ParallelFor.h +++ b/cpp/open3d/core/ParallelFor.h @@ -99,6 +99,11 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) { /// \note If you use a lambda function, capture only the required variables /// instead of all to prevent accidental race conditions. If you want the /// kernel to be used on both CPU and CUDA, capture the variables by value. +/// \note This does not dispatch to SYCL, since SYCL has extra constraints: +/// - Lambdas may capture by value only. +/// - No function pointers / virtual functions. +/// Auto dispatch to SYCL will enforce these conditions even on CPU devices. Use +/// ParallelForSYCL instead. template void ParallelFor(const Device& device, int64_t n, const func_t& func) { #ifdef __CUDACC__ diff --git a/cpp/open3d/core/ParallelForSYCL.h b/cpp/open3d/core/ParallelForSYCL.h new file mode 100644 index 00000000000..d74d4853734 --- /dev/null +++ b/cpp/open3d/core/ParallelForSYCL.h @@ -0,0 +1,63 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#pragma once + +#include +#include + +#include "open3d/core/Device.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + Indexer indexer, + FuncArgs... 
func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + int64_t n = indexer.NumWorkloads(); + if (n == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(n, [indexer, func_args...](int64_t i) { + Functor ef(indexer, func_args...); + ef(i); + }).wait_and_throw(); +} + +/// Run a function in parallel with SYCL. +template +void ParallelForSYCL(const Device& device, + int64_t num_workloads, + FuncArgs... func_args) { + if (!device.IsSYCL()) { + utility::LogError("ParallelFor for SYCL cannot run on device {}.", + device.ToString()); + } + if (num_workloads == 0) { + return; + } + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + /// TODO: Specify grid size based on device properties + queue.parallel_for(num_workloads, [func_args...](int64_t i) { + Functor ef(func_args...); + ef(i); + }).wait_and_throw(); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/SYCLContext.cpp b/cpp/open3d/core/SYCLContext.cpp index 30c5f6f6afc..55115746b35 100644 --- a/cpp/open3d/core/SYCLContext.cpp +++ b/cpp/open3d/core/SYCLContext.cpp @@ -19,6 +19,8 @@ namespace open3d { namespace core { namespace sy { +OPEN3D_DLL_LOCAL std::string GetDeviceTypeName(const sycl::device &device); + SYCLContext &SYCLContext::GetInstance() { static thread_local SYCLContext instance; return instance; @@ -27,19 +29,45 @@ SYCLContext &SYCLContext::GetInstance() { bool SYCLContext::IsAvailable() { return devices_.size() > 0; } bool SYCLContext::IsDeviceAvailable(const Device &device) { - bool rc = false; - for (const Device &device_ : devices_) { - if (device == device_) { - rc = true; - break; - } + return devices_.find(device) != devices_.end(); +} +std::vector SYCLContext::GetAvailableSYCLDevices() { + std::vector device_vec; + for (const auto &device : devices_) { + 
device_vec.push_back(device.first); } - return rc; + return device_vec; } -std::vector SYCLContext::GetAvailableSYCLDevices() { return devices_; } sycl::queue SYCLContext::GetDefaultQueue(const Device &device) { - return device_to_default_queue_.at(device); + return devices_.at(device).queue; +} + +SYCLDevice::SYCLDevice(const sycl::device &sycl_device) { + namespace sid = sycl::info::device; + device = sycl_device; + queue = sycl::queue(device); + name = device.get_info(); + device_type = GetDeviceTypeName(device); + max_work_group_size = device.get_info(); + auto aspects = device.get_info(); + fp64 = std::find(aspects.begin(), aspects.end(), sycl::aspect::fp64) != + aspects.end(); + if (!fp64) { + utility::LogWarning( + "SYCL device {} does not support double precision. Using " + "emulation.", + name); + } + usm_device_allocations = + std::find(aspects.begin(), aspects.end(), + sycl::aspect::usm_device_allocations) != aspects.end(); + if (!usm_device_allocations) { + utility::LogWarning( + "SYCL device {} does not support USM device allocations. 
" "Open3D SYCL support may not work.", + name); + } } SYCLContext::SYCLContext() { @@ -48,9 +76,7 @@ SYCLContext::SYCLContext() { try { const sycl::device &sycl_device = sycl::device(sycl::gpu_selector_v); const Device open3d_device = Device("SYCL:0"); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } @@ -66,9 +92,7 @@ SYCLContext::SYCLContext() { const sycl::device &sycl_device = sycl::device(sycl::cpu_selector_v); const Device open3d_device = Device("SYCL:" + std::to_string(devices_.size())); - devices_.push_back(open3d_device); - device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + devices_.emplace(open3d_device, sycl_device); } catch (const sycl::exception &e) { } diff --git a/cpp/open3d/core/SYCLContext.h b/cpp/open3d/core/SYCLContext.h index 341da29369e..c1a523f17e3 100644 --- a/cpp/open3d/core/SYCLContext.h +++ b/cpp/open3d/core/SYCLContext.h @@ -14,8 +14,8 @@ #pragma once +#include #include -#include #include "open3d/core/Device.h" @@ -23,6 +23,19 @@ namespace open3d { namespace core { namespace sy { +/// @brief SYCL device properties. +struct SYCLDevice { + SYCLDevice(const sycl::device& sycl_device); + std::string name; ///< Friendly / descriptive name of the device. + std::string device_type; ///< cpu, gpu, host, acc, custom, unknown. + sycl::device device; ///< SYCL device. + sycl::queue queue; ///< Default queue for this device. + size_t max_work_group_size; ///< Preferred work group size + bool fp64; ///< Double precision support, else need to emulate. + bool usm_device_allocations; ///< USM device allocations required for + ///< Open3D. +}; + /// Singleton SYCL context manager. 
It maintains: /// - A default queue for each SYCL device class SYCLContext { @@ -45,17 +58,16 @@ class SYCLContext { /// Get the default SYCL queue given an Open3D device. sycl::queue GetDefaultQueue(const Device& device); + /// Get SYCL device properties given an Open3D device. + SYCLDevice GetDeviceProperties(const Device& device) { + return devices_.at(device); + }; + private: SYCLContext(); - /// List of available Open3D SYCL devices. - std::vector devices_; - - /// Maps core::Device to the corresponding default SYCL queue. - std::unordered_map device_to_default_queue_; - - /// Maps core::Device to sycl::device. Internal use only for now. - std::unordered_map device_to_sycl_device_; + /// Map from available Open3D SYCL devices to their properties. + std::map devices_; }; } // namespace sy diff --git a/cpp/open3d/core/SYCLUtils.cpp b/cpp/open3d/core/SYCLUtils.cpp index 011ee5eb653..3a62730239d 100644 --- a/cpp/open3d/core/SYCLUtils.cpp +++ b/cpp/open3d/core/SYCLUtils.cpp @@ -84,7 +84,7 @@ int SYCLDemo() { #ifdef BUILD_SYCL_MODULE -static std::string GetDeviceTypeName(const sycl::device &device) { +OPEN3D_DLL_LOCAL std::string GetDeviceTypeName(const sycl::device &device) { auto device_type = device.get_info(); switch (device_type) { case sycl::info::device_type::cpu: @@ -95,6 +95,8 @@ static std::string GetDeviceTypeName(const sycl::device &device) { return "host"; case sycl::info::device_type::accelerator: return "acc"; + case sycl::info::device_type::custom: + return "custom"; default: return "unknown"; } @@ -210,6 +212,20 @@ bool IsDeviceAvailable(const Device &device) { #endif } +std::string GetDeviceType(const Device &device) { +#ifdef BUILD_SYCL_MODULE + if (IsDeviceAvailable(device)) { + return SYCLContext::GetInstance() + .GetDeviceProperties(device) + .device_type; + } else { + return ""; + } +#else + return ""; +#endif +} + std::vector GetAvailableSYCLDevices() { #ifdef BUILD_SYCL_MODULE return SYCLContext::GetInstance().GetAvailableSYCLDevices(); diff 
--git a/cpp/open3d/core/SYCLUtils.h b/cpp/open3d/core/SYCLUtils.h index c6e34eefe06..72d16a3bbaf 100644 --- a/cpp/open3d/core/SYCLUtils.h +++ b/cpp/open3d/core/SYCLUtils.h @@ -37,6 +37,10 @@ bool IsAvailable(); /// Returns true if the specified SYCL device is available. bool IsDeviceAvailable(const Device& device); +/// Returns the device type (cpu / gpu / accelerator / custom) of the specified +/// device as a string. Returns empty string if the device is not available. +std::string GetDeviceType(const Device& device); + /// Return a list of available SYCL devices. std::vector GetAvailableSYCLDevices(); diff --git a/cpp/open3d/core/Tensor.h b/cpp/open3d/core/Tensor.h index 7a20da6b5b2..445676d59ce 100644 --- a/cpp/open3d/core/Tensor.h +++ b/cpp/open3d/core/Tensor.h @@ -1329,7 +1329,7 @@ class Tensor : public IsDevice { /// Underlying memory buffer for Tensor. std::shared_ptr blob_ = nullptr; -}; // namespace core +}; template <> inline Tensor::Tensor(const std::vector& init_vals, @@ -1425,5 +1425,11 @@ inline Tensor operator/(T scalar_lhs, const Tensor& rhs) { return Tensor::Full({}, scalar_lhs, rhs.GetDtype(), rhs.GetDevice()) / rhs; } +inline void AssertNotSYCL(const Tensor& tensor) { + if (tensor.GetDevice().IsSYCL()) { + utility::LogError("Not supported for SYCL device."); + } +} + } // namespace core -} // namespace open3d +} // namespace open3d \ No newline at end of file diff --git a/cpp/open3d/core/kernel/Arange.cpp b/cpp/open3d/core/kernel/Arange.cpp index b3385a2a10c..3c85401eece 100644 --- a/cpp/open3d/core/kernel/Arange.cpp +++ b/cpp/open3d/core/kernel/Arange.cpp @@ -63,6 +63,12 @@ Tensor Arange(const Tensor& start, const Tensor& stop, const Tensor& step) { if (device.IsCPU()) { ArangeCPU(start, stop, step, dst); + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ArangeSYCL(start, stop, step, dst); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE 
ArangeCUDA(start, stop, step, dst); diff --git a/cpp/open3d/core/kernel/Arange.h b/cpp/open3d/core/kernel/Arange.h index 1a0e88323fd..4547960b5f4 100644 --- a/cpp/open3d/core/kernel/Arange.h +++ b/cpp/open3d/core/kernel/Arange.h @@ -20,6 +20,13 @@ void ArangeCPU(const Tensor& start, const Tensor& step, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void ArangeCUDA(const Tensor& start, const Tensor& stop, diff --git a/cpp/open3d/core/kernel/ArangeSYCL.cpp b/cpp/open3d/core/kernel/ArangeSYCL.cpp new file mode 100644 index 00000000000..cda8912be90 --- /dev/null +++ b/cpp/open3d/core/kernel/ArangeSYCL.cpp @@ -0,0 +1,37 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Arange.h" + +namespace open3d { +namespace core { +namespace kernel { + +void ArangeSYCL(const Tensor& start, + const Tensor& stop, + const Tensor& step, + Tensor& dst) { + Dtype dtype = start.GetDtype(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(start.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t sstart = start.Item(); + scalar_t sstep = step.Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + int64_t n = dst.GetLength(); + queue.parallel_for(n, [=](int64_t i) { + dst_ptr[i] = sstart + static_cast(sstep * i); + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/BinaryEW.cpp b/cpp/open3d/core/kernel/BinaryEW.cpp index 
00eb9b388e5..e34122cd137 100644 --- a/cpp/open3d/core/kernel/BinaryEW.cpp +++ b/cpp/open3d/core/kernel/BinaryEW.cpp @@ -51,6 +51,12 @@ void BinaryEW(const Tensor& lhs, if (lhs.IsCPU()) { BinaryEWCPU(lhs, rhs, dst, op_code); + } else if (lhs.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + BinaryEWSYCL(lhs, rhs, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (lhs.IsCUDA()) { #ifdef BUILD_CUDA_MODULE BinaryEWCUDA(lhs, rhs, dst, op_code); diff --git a/cpp/open3d/core/kernel/BinaryEW.h b/cpp/open3d/core/kernel/BinaryEW.h index fbd4af3e849..2f2cb57888e 100644 --- a/cpp/open3d/core/kernel/BinaryEW.h +++ b/cpp/open3d/core/kernel/BinaryEW.h @@ -48,6 +48,13 @@ void BinaryEWCPU(const Tensor& lhs, Tensor& dst, BinaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void BinaryEWCUDA(const Tensor& lhs, const Tensor& rhs, diff --git a/cpp/open3d/core/kernel/BinaryEWSYCL.cpp b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp new file mode 100644 index 00000000000..1466b11f97e --- /dev/null +++ b/cpp/open3d/core/kernel/BinaryEWSYCL.cpp @@ -0,0 +1,270 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelForSYCL.h" +#include "open3d/core/SizeVector.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/BinaryEW.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { + 
+struct BinaryElementKernel { + void operator()(int64_t i) {} + BinaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + +protected: + Indexer indexer; +}; + +// Min, Max +#define BINARY_ELEMENT_KERNEL(name, elem_fn) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = elem_fn(*lhs, *rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Max, sycl::max); +BINARY_ELEMENT_KERNEL(Min, sycl::min); +#undef BINARY_ELEMENT_KERNEL + +/// Specialize Min, Max for Bool, since sycl::min, sycl::max do not support it. +template <> +struct MaxElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs || *rhs; + } +}; +template <> +struct MinElementKernel : public BinaryElementKernel { + using BinaryElementKernel::BinaryElementKernel; + void operator()(int64_t i) { + const bool* lhs = indexer.GetInputPtr(0, i); + const bool* rhs = indexer.GetInputPtr(1, i); + bool* dst = indexer.GetOutputPtr(i); + *dst = *lhs && *rhs; + } +}; + +// Arithmetic and Relational ops. 
+#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = (*lhs)elem_op(*rhs); \ + } \ + } + +BINARY_ELEMENT_KERNEL(Add, +); +BINARY_ELEMENT_KERNEL(Sub, -); +BINARY_ELEMENT_KERNEL(Mul, *); +BINARY_ELEMENT_KERNEL(Div, /); +BINARY_ELEMENT_KERNEL(Gt, >); +BINARY_ELEMENT_KERNEL(Lt, <); +BINARY_ELEMENT_KERNEL(Geq, >=); +BINARY_ELEMENT_KERNEL(Leq, <=); +BINARY_ELEMENT_KERNEL(Eq, ==); +BINARY_ELEMENT_KERNEL(Neq, !=); +#undef BINARY_ELEMENT_KERNEL + +// Logical ops. +#define BINARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public BinaryElementKernel { \ + using BinaryElementKernel::BinaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* lhs = indexer.GetInputPtr(0, i); \ + const src_t* rhs = indexer.GetInputPtr(1, i); \ + dst_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(*lhs) elem_op static_cast(*rhs); \ + } \ + } +BINARY_ELEMENT_KERNEL(LogicalAnd, &&); +BINARY_ELEMENT_KERNEL(LogicalOr, ||); +BINARY_ELEMENT_KERNEL(LogicalXor, !=); +#undef BINARY_ELEMENT_KERNEL + +} // namespace + +void BinaryEWSYCL(const Tensor& lhs, + const Tensor& rhs, + Tensor& dst, + BinaryEWOpCode op_code) { + Dtype src_dtype = lhs.GetDtype(); + Dtype dst_dtype = dst.GetDtype(); + Device device = lhs.GetDevice(); + + if (s_boolean_binary_ew_op_codes.find(op_code) != + s_boolean_binary_ew_op_codes.end()) { + if (dst_dtype == src_dtype) { + // Inplace boolean op's output type is the same as the + // input. e.g. np.logical_and(a, b, out=a), where a, b are + // floats. 
+ Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else if (dst_dtype == core::Bool) { + // By default, output is boolean type. + Indexer indexer({lhs, rhs}, dst, + DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::LogicalAnd: + ParallelForSYCL< + LogicalAndElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::LogicalOr: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::LogicalXor: + ParallelForSYCL< + LogicalXorElementKernel>( + device, indexer); + break; + case BinaryEWOpCode::Gt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Lt: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ge: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Le: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Eq: + ParallelForSYCL>( + device, indexer); + break; + case BinaryEWOpCode::Ne: + ParallelForSYCL>( + device, indexer); + break; + default: + break; + } + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the 
input."); + } + } else if (op_code == BinaryEWOpCode::Maximum || + op_code == BinaryEWOpCode::Minimum) { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Maximum: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Minimum: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } else { + Indexer indexer({lhs, rhs}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case BinaryEWOpCode::Add: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Sub: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Mul: + ParallelForSYCL>(device, + indexer); + break; + case BinaryEWOpCode::Div: + ParallelForSYCL>(device, + indexer); + break; + default: + break; + } + }); + } +} +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexGetSet.cpp b/cpp/open3d/core/kernel/IndexGetSet.cpp index 601b4d73f64..b880f50fb51 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.cpp +++ b/cpp/open3d/core/kernel/IndexGetSet.cpp @@ -35,6 +35,10 @@ void IndexGet(const Tensor& src, if (src.IsCPU()) { IndexGetCPU(src, dst, index_tensors, indexed_shape, indexed_strides); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexGetSYCL(src, dst, index_tensors, indexed_shape, indexed_strides); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexGetCUDA(src, dst, index_tensors, indexed_shape, indexed_strides); @@ -56,6 +60,11 @@ void IndexSet(const Tensor& src, if (dst.IsCPU()) { IndexSetCPU(src_same_device, dst, index_tensors, indexed_shape, indexed_strides); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexSetSYCL(src_same_device, dst, index_tensors, indexed_shape, + indexed_strides); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexSetCUDA(src_same_device, dst, 
index_tensors, indexed_shape, diff --git a/cpp/open3d/core/kernel/IndexGetSet.h b/cpp/open3d/core/kernel/IndexGetSet.h index 130b80836a9..c4a6d3b22c2 100644 --- a/cpp/open3d/core/kernel/IndexGetSet.h +++ b/cpp/open3d/core/kernel/IndexGetSet.h @@ -26,6 +26,14 @@ void IndexGetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexGetCUDA(const Tensor& src, Tensor& dst, @@ -46,6 +54,14 @@ void IndexSetCPU(const Tensor& src, const SizeVector& indexed_shape, const SizeVector& indexed_strides); +#ifdef BUILD_SYCL_MODULE +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides); +#endif + #ifdef BUILD_CUDA_MODULE void IndexSetCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp new file mode 100644 index 00000000000..dfc56397417 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexGetSetSYCL.cpp @@ -0,0 +1,76 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/AdvancedIndexing.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/IndexGetSet.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexGetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& 
indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::GET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +void IndexSetSYCL(const Tensor& src, + Tensor& dst, + const std::vector& index_tensors, + const SizeVector& indexed_shape, + const SizeVector& indexed_strides) { + Dtype dtype = src.GetDtype(); + AdvancedIndexer ai(src, dst, index_tensors, indexed_shape, indexed_strides, + AdvancedIndexer::AdvancedIndexerMode::SET); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + if (dtype.IsObject()) { + int64_t object_byte_size = dtype.ByteSize(); + for (int64_t idx = 0; idx < ai.NumWorkloads(); ++idx) { + queue.memcpy(ai.GetOutputPtr(idx), ai.GetInputPtr(idx), + object_byte_size); + } + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dtype, [&]() { + queue.parallel_for(ai.NumWorkloads(), [ai](int64_t idx) { + // char* -> scalar_t* needs reinterpret_cast + *reinterpret_cast(ai.GetOutputPtr(idx)) = + *reinterpret_cast( + ai.GetInputPtr(idx)); + }).wait_and_throw(); + }); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/IndexReduction.cpp b/cpp/open3d/core/kernel/IndexReduction.cpp index 19265d36620..e9c9df3f388 100644 --- a/cpp/open3d/core/kernel/IndexReduction.cpp +++ 
b/cpp/open3d/core/kernel/IndexReduction.cpp @@ -35,6 +35,10 @@ void IndexAdd_(int64_t dim, if (dst.IsCPU()) { IndexAddCPU_(dim, index, src_permute, dst_permute); + } else if (dst.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + IndexAddSYCL_(dim, index, src_permute, dst_permute); +#endif } else if (dst.IsCUDA()) { #ifdef BUILD_CUDA_MODULE IndexAddCUDA_(dim, index, src_permute, dst_permute); diff --git a/cpp/open3d/core/kernel/IndexReduction.h b/cpp/open3d/core/kernel/IndexReduction.h index f4fd3516b41..2b4e122f3f0 100644 --- a/cpp/open3d/core/kernel/IndexReduction.h +++ b/cpp/open3d/core/kernel/IndexReduction.h @@ -24,6 +24,13 @@ void IndexAddCPU_(int64_t dim, const Tensor& src, Tensor& dst); +#ifdef BUILD_SYCL_MODULE +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst); +#endif + #ifdef BUILD_CUDA_MODULE void IndexAddCUDA_(int64_t dim, const Tensor& index, diff --git a/cpp/open3d/core/kernel/IndexReductionSYCL.cpp b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp new file mode 100644 index 00000000000..47da284dc93 --- /dev/null +++ b/cpp/open3d/core/kernel/IndexReductionSYCL.cpp @@ -0,0 +1,61 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +void IndexAddSYCL_(int64_t dim, + const Tensor& index, + const Tensor& src, + Tensor& dst) { + // index: [N,], src: [N, D], dst: [M, D] + // In Indexer, output shape defines the actual primary strides. + // However, in IndexAdd_, input dominates the iterations. 
+ // So put dst (output) at indexer's input, and src (input) at output. + Indexer indexer({dst}, src, DtypePolicy::NONE); + + // Index is simply a 1D contiguous tensor, with a different stride + // behavior to src. So use raw pointer for simplicity. + auto index_ptr = index.GetDataPtr(); + + int64_t broadcasting_elems = 1; + for (int64_t d = 1; d < src.NumDims(); ++d) { + broadcasting_elems *= src.GetShape(d); + } + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(src.GetDevice()); + + // TODO: Replace with SYCL reduction API + DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + queue.parallel_for(index.GetLength(), [=](int64_t workload_idx) { + int64_t reduction_idx = workload_idx / broadcasting_elems; + int64_t broadcasting_idx = workload_idx % broadcasting_elems; + + const int64_t idx = index_ptr[reduction_idx]; + int64_t dst_idx = idx * broadcasting_elems + broadcasting_idx; + + // Note input and output is switched here to adapt to the + // indexer + scalar_t* src_ptr = indexer.GetOutputPtr(0, idx); + scalar_t* dst_ptr = indexer.GetInputPtr(0, dst_idx); + sycl::atomic_ref(*dst_ptr) += + *src_ptr; + }).wait_and_throw(); + }); +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/NonZero.cpp b/cpp/open3d/core/kernel/NonZero.cpp index 6d04f94e40a..686ce9ea885 100644 --- a/cpp/open3d/core/kernel/NonZero.cpp +++ b/cpp/open3d/core/kernel/NonZero.cpp @@ -18,6 +18,12 @@ namespace kernel { Tensor NonZero(const Tensor& src) { if (src.IsCPU()) { return NonZeroCPU(src); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + return NonZeroSYCL(src); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE return NonZeroCUDA(src); diff --git a/cpp/open3d/core/kernel/NonZero.h b/cpp/open3d/core/kernel/NonZero.h index ab926ddd8c3..36ac8b4f5a0 100644 --- a/cpp/open3d/core/kernel/NonZero.h +++ 
b/cpp/open3d/core/kernel/NonZero.h @@ -17,6 +17,10 @@ Tensor NonZero(const Tensor& src); Tensor NonZeroCPU(const Tensor& src); +#ifdef BUILD_SYCL_MODULE +Tensor NonZeroSYCL(const Tensor& src); +#endif + #ifdef BUILD_CUDA_MODULE Tensor NonZeroCUDA(const Tensor& src); #endif diff --git a/cpp/open3d/core/kernel/NonZeroSYCL.cpp b/cpp/open3d/core/kernel/NonZeroSYCL.cpp new file mode 100644 index 00000000000..887f297c9b3 --- /dev/null +++ b/cpp/open3d/core/kernel/NonZeroSYCL.cpp @@ -0,0 +1,76 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include +#include +#include + +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/kernel/NonZero.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +namespace kernel { + +Tensor NonZeroSYCL(const Tensor& src) { + // Get flattened non-zero indices. 
+ TensorIterator src_iter(src); + const int64_t num_elements = src.NumElements(); + auto device = src.GetDevice(); + Tensor indices = Tensor::Arange(0, num_elements, 1, core::Int64, device); + Tensor non_zero_indices(SizeVector({num_elements}), Int64, device); + int64_t *non_zero_indices_ptr = non_zero_indices.GetDataPtr(), + *indices_ptr = indices.GetDataPtr(); + size_t num_non_zeros; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src.GetDtype(), [&]() { + auto it = std::copy_if( + oneapi::dpl::execution::dpcpp_default, indices_ptr, + indices_ptr + num_elements, non_zero_indices_ptr, + [src_iter](int64_t index) { + auto src_ptr = static_cast( + src_iter.GetPtr(index)); + OPEN3D_ASSERT(src_ptr != nullptr && "Internal error."); + return *src_ptr != 0; + }); + num_non_zeros = std::distance(non_zero_indices_ptr, it); + }); + + // Transform flattened indices to indices in each dimension. + const auto num_dims = src.NumDims(); + SizeVector shape = src.GetShape(); + // MAX_DIMS: Maximum number of dimensions of TensorRef, defined in + // Indexer.h. 
+ sycl::marray shape_vec; // device copyable + if (shape.size() > MAX_DIMS) { + utility::LogError("Too many dimensions: {} > MAX_DIMS={}.", + shape.size(), MAX_DIMS); + } + for (auto k = 0; k < num_dims; ++k) shape_vec[k] = shape[k]; + Tensor result({num_dims, static_cast(num_non_zeros)}, Int64, + device); + int64_t* result_ptr = result.GetDataPtr(); + auto queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + + queue.parallel_for(num_non_zeros, [=](int64_t i) { + auto non_zero_index = non_zero_indices_ptr[i]; + auto this_result_ptr = + result_ptr + i + (num_dims - 1) * num_non_zeros; + OPEN3D_ASSERT(this_result_ptr != nullptr && "Internal error."); + for (auto dim = num_dims - 1; dim >= 0; + dim--, this_result_ptr -= num_non_zeros) { + *this_result_ptr = non_zero_index % shape_vec[dim]; + non_zero_index = non_zero_index / shape_vec[dim]; + } + }).wait_and_throw(); + return result; +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/Reduction.cpp b/cpp/open3d/core/kernel/Reduction.cpp index a94087d609a..1c022f27dc8 100644 --- a/cpp/open3d/core/kernel/Reduction.cpp +++ b/cpp/open3d/core/kernel/Reduction.cpp @@ -37,6 +37,10 @@ void Reduction(const Tensor& src, dims); } } + if (src.NumElements() == 0) { + utility::LogError( + "Zero-size Tensor does not support Arg Reductions."); + } } SizeVector keepdim_shape = @@ -71,6 +75,12 @@ void Reduction(const Tensor& src, if (src.IsCPU()) { ReductionCPU(src, dst, dims, keepdim, op_code); + } else if (src.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ReductionSYCL(src, dst, dims, keepdim, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src.IsCUDA()) { #ifdef BUILD_CUDA_MODULE ReductionCUDA(src, dst, dims, keepdim, op_code); diff --git a/cpp/open3d/core/kernel/Reduction.h b/cpp/open3d/core/kernel/Reduction.h index a1a4b74e3a3..ec3bec62ab7 100644 --- a/cpp/open3d/core/kernel/Reduction.h +++ 
b/cpp/open3d/core/kernel/Reduction.h @@ -59,6 +59,14 @@ void ReductionCPU(const Tensor& src, bool keepdim, ReductionOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void ReductionCUDA(const Tensor& src, Tensor& dst, diff --git a/cpp/open3d/core/kernel/ReductionSYCL.cpp b/cpp/open3d/core/kernel/ReductionSYCL.cpp new file mode 100644 index 00000000000..8c93da87729 --- /dev/null +++ b/cpp/open3d/core/kernel/ReductionSYCL.cpp @@ -0,0 +1,320 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "open3d/core/Dispatch.h" +#include "open3d/core/Indexer.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/kernel/Reduction.h" +#include "open3d/utility/Logging.h" +#include "open3d/utility/Parallel.h" + +namespace open3d { +namespace core { +namespace kernel { + +namespace { + +template +struct ArgMinReduction { + using basic_reduction = sycl::minimum; + std::pair operator()(int64_t a_idx, + scalar_t a_val, + int64_t b_idx, + scalar_t b_val) const { + return a_val < b_val ? std::make_pair(a_idx, a_val) + : std::make_pair(b_idx, b_val); + } +}; + +template +struct ArgMaxReduction { + using basic_reduction = sycl::maximum; + std::pair operator()(int64_t a_idx, + scalar_t a_val, + int64_t b_idx, + scalar_t b_val) const { + return a_val > b_val ? 
std::make_pair(a_idx, a_val) + : std::make_pair(b_idx, b_val); + } +}; + +// TODO: This launches one kernel per output element, which can be inefficient +// in cases where the reduction dim is small but the non-reduced dim is large. +// Unit tests for a large number of outputs are disabled. +// Speed-up by launching one kernel for the entire reduction. +template +void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) { + auto device_props = + sy::SYCLContext::GetInstance().GetDeviceProperties(device); + auto queue = device_props.queue; + auto work_group_size = device_props.max_work_group_size; + size_t log2elements_per_group = 13; + auto elements_per_group = (1 << log2elements_per_group); // 8192 + size_t log2workitems_per_group = 8; + auto workitems_per_group = (1 << log2workitems_per_group); // 256 + auto elements_per_work_item = + elements_per_group / workitems_per_group; // 32 (= max SIMD size) + auto mask = ~(~0 << log2workitems_per_group); + ReductionOp red_op; + + for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements(); + output_idx++) { + // sub_indexer.NumWorkloads() == ipo. + // sub_indexer's workload_idx is indexer's ipo_idx. + Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx); + auto num_elements = scalar_out_indexer.NumWorkloads(); + auto num_work_groups = num_elements / elements_per_group; + if (num_elements > elements_per_group * num_work_groups) + ++num_work_groups; + // ensure each work group has work_group_size work items + auto num_work_items = num_work_groups * work_group_size; + + auto red_cg = [&](auto& cgh) { + auto output = scalar_out_indexer.GetOutputPtr(0); + // Setting this still doesn't initialize to identity - + // output buffer must be initialized separately. 
+ auto sycl_reducer = sycl::reduction( + output, identity, red_op, + {sycl::property::reduction::initialize_to_identity()}); + cgh.parallel_for( + sycl::nd_range<1>{num_work_items, work_group_size}, + sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) { + auto glob_id = item.get_global_id(0); + auto offset = ((glob_id >> log2workitems_per_group) + << log2elements_per_group) + + (glob_id & mask); + auto item_out = identity; + for (size_t i = 0; i < elements_per_work_item; i++) { + size_t idx = + (i << log2workitems_per_group) + offset; + if (idx >= num_elements) break; + auto val = + *scalar_out_indexer.GetInputPtr( + 0, idx); + item_out = red_op(item_out, val); + } + red_arg.combine(item_out); + }); + }; + + auto e = queue.submit(red_cg); + } + queue.wait_and_throw(); +} + +// Based on OneAPI GPU optimization guide code sample (Blocked access to +// input data + SYCL builtin reduction ops for final reduction) +// TODO: This launches one kernel per output element, which can be inefficient +// in cases where the reduction dim is small but the non-reduced dim is large. +// Speed-up by launching one kernel for the entire reduction. +template +void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) { + auto device_props = + sy::SYCLContext::GetInstance().GetDeviceProperties(device); + auto queue = device_props.queue; + auto work_group_size = device_props.max_work_group_size; + size_t log2elements_per_group = 13; + auto elements_per_group = (1 << log2elements_per_group); // 8192 + size_t log2workitems_per_group = 8; + auto workitems_per_group = (1 << log2workitems_per_group); // 256 + auto elements_per_work_item = + elements_per_group / workitems_per_group; // 32 (= max SIMD size) + auto mask = ~(~0 << log2workitems_per_group); + ReductionOp red_op; + + // atomic flag. Must be 4 bytes. 
+ sycl::buffer output_in_use{indexer.NumOutputElements()}; + auto e_fill = queue.submit([&](auto& cgh) { + auto acc_output_in_use = + output_in_use.get_access(cgh); + cgh.fill(acc_output_in_use, 0); + }); + + for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements(); + output_idx++) { + // sub_indexer.NumWorkloads() == ipo. + // sub_indexer's workload_idx is indexer's ipo_idx. + Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx); + auto num_elements = scalar_out_indexer.NumWorkloads(); + auto num_work_groups = num_elements / elements_per_group; + if (num_elements > elements_per_group * num_work_groups) + ++num_work_groups; + // ensure each work group has work_group_size work items + auto num_work_items = num_work_groups * work_group_size; + + sycl::buffer this_output_in_use{output_in_use, output_idx, + 1}; + auto arg_red_cg = [&](auto& cgh) { + auto acc_in_use = + this_output_in_use + .get_access(cgh); + cgh.parallel_for( + sycl::nd_range<1>{num_work_items, work_group_size}, + [=](sycl::nd_item<1> item) { + auto& out_idx = + *scalar_out_indexer.GetOutputPtr(0, 0); + auto& out_val = + *scalar_out_indexer.GetOutputPtr(1, + 0); + auto glob_id = item.get_global_id(0); + auto this_group = item.get_group(); + auto offset = ((glob_id >> log2workitems_per_group) + << log2elements_per_group) + + (glob_id & mask); + int64_t it_idx = 0; + scalar_t it_val = identity; + for (size_t i = 0; i < elements_per_work_item; i++) { + size_t idx = + (i << log2workitems_per_group) + offset; + if (idx >= num_elements) break; + auto val = + *scalar_out_indexer.GetInputPtr( + 0, idx); + std::tie(it_idx, it_val) = + red_op(it_idx, it_val, idx, val); + } + auto group_out_val = sycl::reduce_over_group( + this_group, it_val, identity, + typename ReductionOp::basic_reduction()); + // atomic (serial) reduction over all groups. SYCL does + // not have a barrier over groups. Work item(s) with min + // / max value update the output. 
(non-deterministic) + if (it_val == group_out_val) { + // TODO: Look for a better option to a spinlock + // mutex. + auto in_use = sycl::atomic_ref< + int32_t, sycl::memory_order::acq_rel, + sycl::memory_scope::device>(acc_in_use[0]); + while (in_use.exchange(1) == 1) { + } + std::tie(out_idx, out_val) = red_op( + out_idx, out_val, it_idx, group_out_val); + in_use.store(0); + } + }); + }; + + auto e = queue.submit(arg_red_cg); + } + queue.wait_and_throw(); +} +} // namespace + +void ReductionSYCL(const Tensor& src, + Tensor& dst, + const SizeVector& dims, + bool keepdim, + ReductionOpCode op_code) { + Device device = src.GetDevice(); + if (s_regular_reduce_ops.find(op_code) != s_regular_reduce_ops.end()) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + scalar_t identity; + switch (op_code) { + case ReductionOpCode::Sum: + dst.Fill(0); + SYCLReductionEngine, scalar_t>( + device, indexer, 0); + break; + case ReductionOpCode::Prod: + dst.Fill(1); + SYCLReductionEngine, scalar_t>( + device, indexer, 1); + break; + case ReductionOpCode::Min: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Min."); + } else { + identity = std::numeric_limits::max(); + dst.Fill(identity); + SYCLReductionEngine, scalar_t>( + device, indexer, identity); + } + break; + case ReductionOpCode::Max: + if (indexer.NumWorkloads() == 0) { + utility::LogError( + "Zero-size Tensor does not support Max."); + } else { + identity = std::numeric_limits::lowest(); + dst.Fill(identity); + SYCLReductionEngine, scalar_t>( + device, indexer, identity); + } + break; + default: + utility::LogError("Unsupported op code."); + break; + } + }); + } else if (s_arg_reduce_ops.find(op_code) != s_arg_reduce_ops.end()) { + if (dst.GetDtype() != core::Int64) { + utility::LogError("Arg-reduction must have int64 output dtype."); + } + // Accumulation buffer to store temporary min/max values. 
+ Tensor dst_acc(dst.GetShape(), src.GetDtype(), src.GetDevice()); + Indexer indexer({src}, {dst, dst_acc}, DtypePolicy::INPUT_SAME, dims); + DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() { + scalar_t identity; + switch (op_code) { + case ReductionOpCode::ArgMin: + identity = std::numeric_limits::max(); + dst_acc.Fill(identity); + SYCLArgReductionEngine, scalar_t>( + device, indexer, identity); + break; + case ReductionOpCode::ArgMax: + identity = std::numeric_limits::lowest(); + dst_acc.Fill(identity); + SYCLArgReductionEngine, scalar_t>( + device, indexer, identity); + break; + default: + utility::LogError("Unsupported op code."); + break; + } + }); + } else if (s_boolean_reduce_ops.find(op_code) != + s_boolean_reduce_ops.end()) { + if (src.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean input tensor."); + } + if (dst.GetDtype() != core::Bool) { + utility::LogError( + "Boolean reduction only supports boolean output tensor."); + } + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims); + switch (op_code) { + case ReductionOpCode::All: + // Identity == true. 0-sized tensor, returns true. + dst.Fill(true); + SYCLReductionEngine, bool>( + device, indexer, true); + break; + case ReductionOpCode::Any: + // Identity == false. 0-sized tensor, returns false. 
+ dst.Fill(false); + SYCLReductionEngine, bool>( + device, indexer, false); + break; + default: + utility::LogError("Unsupported op code."); + break; + } + } else { + utility::LogError("Unsupported op code."); + } +} + +} // namespace kernel +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/kernel/UnaryEW.cpp b/cpp/open3d/core/kernel/UnaryEW.cpp index 911b2885e34..2513d41205d 100644 --- a/cpp/open3d/core/kernel/UnaryEW.cpp +++ b/cpp/open3d/core/kernel/UnaryEW.cpp @@ -22,6 +22,19 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { src.GetShape(), dst.GetShape()); } + // Check dtype compatibility + const auto float_only_ops = {UnaryEWOpCode::Sqrt, UnaryEWOpCode::Sin, + UnaryEWOpCode::Cos, UnaryEWOpCode::Exp, + UnaryEWOpCode::IsNan, UnaryEWOpCode::IsInf, + UnaryEWOpCode::IsFinite}; + Dtype src_dtype = src.GetDtype(); + if (std::find(float_only_ops.begin(), float_only_ops.end(), op_code) != + float_only_ops.end() && + src_dtype != core::Float32 && src_dtype != core::Float64) { + utility::LogError("Only supports Float32 and Float64, but {} is used.", + src_dtype.ToString()); + } + // Dispatch to device Device src_device = src.GetDevice(); Device dst_device = dst.GetDevice(); @@ -32,6 +45,12 @@ void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { if (src_device.IsCPU()) { UnaryEWCPU(src, dst, op_code); + } else if (src_device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + UnaryEWSYCL(src, dst, op_code); +#else + utility::LogError("Not compiled with SYCL, but SYCL device is used."); +#endif } else if (src_device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE UnaryEWCUDA(src, dst, op_code); @@ -73,6 +92,8 @@ void Copy(const Tensor& src, Tensor& dst) { #else utility::LogError("Not compiled with SYCL, but SYCL device is used."); #endif + } else { + utility::LogError("Copy: SYCL <-> CUDA Unimplemented device"); } } diff --git a/cpp/open3d/core/kernel/UnaryEW.h b/cpp/open3d/core/kernel/UnaryEW.h index 907b3371167..131dff08b91 100644 
--- a/cpp/open3d/core/kernel/UnaryEW.h +++ b/cpp/open3d/core/kernel/UnaryEW.h @@ -32,15 +32,18 @@ enum class UnaryEWOpCode { }; void UnaryEW(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); - void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); +#ifdef BUILD_SYCL_MODULE +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); +#endif + #ifdef BUILD_CUDA_MODULE void UnaryEWCUDA(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code); #endif -// Copy is separated from other unary ops since it support cross-device copy and -// dtype casting. +// Copy is separated from other unary ops since it supports cross-device copy +// and dtype casting. void Copy(const Tensor& src, Tensor& dst); void CopyCPU(const Tensor& src, Tensor& dst); diff --git a/cpp/open3d/core/kernel/UnaryEWCPU.cpp b/cpp/open3d/core/kernel/UnaryEWCPU.cpp index 1a502e42a1d..f4673200b24 100644 --- a/cpp/open3d/core/kernel/UnaryEWCPU.cpp +++ b/cpp/open3d/core/kernel/UnaryEWCPU.cpp @@ -218,14 +218,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - auto assert_dtype_is_float = [](Dtype dtype) -> void { - if (dtype != core::Float32 && dtype != core::Float64) { - utility::LogError( - "Only supports Float32 and Float64, but {} is used.", - dtype.ToString()); - } - }; - if (op_code == UnaryEWOpCode::LogicalNot) { if (dst_dtype == src_dtype) { Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); @@ -259,7 +251,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { } else if (op_code == UnaryEWOpCode::IsNan || op_code == UnaryEWOpCode::IsInf || op_code == UnaryEWOpCode::IsFinite) { - assert_dtype_is_float(src_dtype); Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); #ifdef BUILD_ISPC_MODULE ispc::Indexer ispc_indexer = indexer.ToISPC(); @@ -291,7 +282,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { 
DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { switch (op_code) { case UnaryEWOpCode::Sqrt: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSqrtElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -299,7 +289,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Sin: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUSinElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -307,7 +296,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Cos: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUCosElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, @@ -322,7 +310,6 @@ void UnaryEWCPU(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { &ispc_indexer)); break; case UnaryEWOpCode::Exp: - assert_dtype_is_float(src_dtype); LaunchUnaryEWKernel( indexer, CPUExpElementKernel, OPEN3D_TEMPLATE_VECTORIZED(scalar_t, diff --git a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp index fed05af7b47..af99df1c91e 100644 --- a/cpp/open3d/core/kernel/UnaryEWSYCL.cpp +++ b/cpp/open3d/core/kernel/UnaryEWSYCL.cpp @@ -8,8 +8,12 @@ #include #include +#include "open3d/core/Dispatch.h" #include "open3d/core/Dtype.h" +#include "open3d/core/Indexer.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/ParallelFor.h" +#include "open3d/core/ParallelForSYCL.h" #include "open3d/core/SizeVector.h" #include "open3d/core/Tensor.h" #include "open3d/core/kernel/UnaryEW.h" @@ -19,32 +23,265 @@ namespace open3d { namespace core { namespace kernel { +namespace { + +struct UnaryElementKernel { + UnaryElementKernel(Indexer indexer_) : indexer(indexer_) {} + void operator()(int64_t i) {} + +protected: + Indexer indexer; +}; + +template +struct CopyElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + 
const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(*src); + } +}; + +// Math: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + src_t* dst = indexer.GetOutputPtr(i); \ + *dst = static_cast(elem_op(static_cast(*src))); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + float* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(Sqrt, sycl::sqrt); +UNARY_ELEMENT_KERNEL(Sin, sycl::sin); +UNARY_ELEMENT_KERNEL(Cos, sycl::cos); +UNARY_ELEMENT_KERNEL(Exp, sycl::exp); +// TODO: Use sycl::abs for integers (no casting) +UNARY_ELEMENT_KERNEL(Abs, sycl::fabs); +UNARY_ELEMENT_KERNEL(Floor, sycl::floor); +UNARY_ELEMENT_KERNEL(Ceil, sycl::ceil); +UNARY_ELEMENT_KERNEL(Round, sycl::round); +UNARY_ELEMENT_KERNEL(Trunc, sycl::trunc); +#undef UNARY_ELEMENT_KERNEL + +// No special treatment for unsigned types - we use the SYCL runtime +// default +template +struct NegElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const scalar_t* src = indexer.GetInputPtr(0, i); + scalar_t* dst = indexer.GetOutputPtr(i); + *dst = -*src; + } +}; + +// Float checkers: integers treated as double (C++11) +// no casting needed for float +#define UNARY_ELEMENT_KERNEL(name, elem_op) \ + template \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const src_t* src = indexer.GetInputPtr(0, i); \ + bool* dst 
= indexer.GetOutputPtr(i); \ + *dst = elem_op(static_cast(*src)); \ + } \ + }; \ + template <> \ + struct name##ElementKernel : public UnaryElementKernel { \ + using UnaryElementKernel::UnaryElementKernel; \ + void operator()(int64_t i) { \ + const float* src = indexer.GetInputPtr(0, i); \ + bool* dst = indexer.GetOutputPtr(i); \ + *dst = elem_op(*src); \ + } \ + } + +UNARY_ELEMENT_KERNEL(IsNan, sycl::isnan); +UNARY_ELEMENT_KERNEL(IsInf, sycl::isinf); +UNARY_ELEMENT_KERNEL(IsFinite, sycl::isfinite); +#undef UNARY_ELEMENT_KERNEL + +template +struct LogicalNotElementKernel : public UnaryElementKernel { + using UnaryElementKernel::UnaryElementKernel; + void operator()(int64_t i) { + const src_t* src = indexer.GetInputPtr(0, i); + dst_t* dst = indexer.GetOutputPtr(i); + *dst = static_cast(!static_cast(*src)); + } +}; +} // namespace + void CopySYCL(const Tensor& src, Tensor& dst) { - // It has been checked that - // - at least one of src or dst is SYCL device + // src and dst have been checked to have the same shape + // at least one of src and dst is SYCL and the other is SYCL or CPU SizeVector shape = src.GetShape(); + Dtype src_dtype = src.GetDtype(), dst_dtype = dst.GetDtype(); + Device src_device = src.GetDevice(), dst_device = dst.GetDevice(); + Device device_with_queue = dst.IsSYCL() ? 
dst.GetDevice() : src.GetDevice(); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue); + + if (src_device.IsSYCL() && dst_device.IsSYCL()) { + if (src.IsContiguous() && dst.IsContiguous() && + src.GetShape() == dst.GetShape() && src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst.GetDevice(), + src.GetDataPtr(), src.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else if (dst.NumElements() > 1 && dst.IsContiguous() && + src.NumElements() == 1 && !src_dtype.IsObject()) { + int64_t num_elements = dst.NumElements(); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + scalar_t scalar_element = src.To(dst_dtype).Item(); + scalar_t* dst_ptr = dst.GetDataPtr(); + queue.fill(dst_ptr, scalar_element, num_elements) + .wait_and_throw(); + }); + } else if (src_device == dst_device) { // non-contiguous or broadcast + // on same SYCL device + Indexer indexer({src}, dst, DtypePolicy::NONE); + if (src.GetDtype().IsObject()) { + // TODO: This is likely very slow. Coalesce into less memcpy + // calls. 
+ int64_t object_byte_size = src.GetDtype().ByteSize(); + for (int64_t i = 0; i < indexer.NumWorkloads(); ++i) { + const void* src_ptr = indexer.GetInputPtr(0, i); + void* dst_ptr = indexer.GetOutputPtr(i); + queue.memcpy(dst_ptr, src_ptr, object_byte_size); + } + queue.wait_and_throw(); + } else { + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + using src_t = scalar_t; + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(dst_dtype, [&]() { + using dst_t = scalar_t; + ParallelForSYCL>( + device_with_queue, indexer); + }); + }); + } + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else if (src_device.IsCPU() && dst_device.IsSYCL() || + src_device.IsSYCL() && dst_device.IsCPU()) { + Tensor src_conti = src.Contiguous(); // No op if already contiguous + if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && + src_dtype == dst_dtype) { + MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, + src_conti.GetDataPtr(), src_conti.GetDevice(), + src_dtype.ByteSize() * shape.NumElements()); + } else { + dst.CopyFrom(src.Contiguous().To(dst_device)); + } + } else { + utility::LogError("Wrong device type {} -> {}", src_device.ToString(), + dst_device.ToString()); + } +} + +void UnaryEWSYCL(const Tensor& src, Tensor& dst, UnaryEWOpCode op_code) { + // src and dst have been changed to have the same shape, device Dtype src_dtype = src.GetDtype(); Dtype dst_dtype = dst.GetDtype(); - Device dst_device = dst.GetDevice(); - Device src_device = src.GetDevice(); + Device device = src.GetDevice(); // == dst.GetDevice() - if (src_dtype != dst_dtype) { - utility::LogError( - "CopySYCL: Dtype conversion from src to dst not implemented!"); - } - if ((dst_device.IsSYCL() && !dst.IsContiguous()) || - (src_device.IsSYCL() && !src.IsContiguous())) { - utility::LogError( - "CopySYCL: NonContiguous SYCL tensor Copy not implemented!"); - } - Tensor src_conti = src.Contiguous(); // No op if already contiguous - if (dst.IsContiguous() && src.GetShape() == dst.GetShape() && - 
src_dtype == dst_dtype) { - MemoryManager::Memcpy(dst.GetDataPtr(), dst_device, - src_conti.GetDataPtr(), src_conti.GetDevice(), - src_dtype.ByteSize() * shape.NumElements()); + if (op_code == UnaryEWOpCode::LogicalNot) { + if (dst_dtype == src_dtype) { + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else if (dst_dtype == Bool) { + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE_WITH_BOOL(src_dtype, [&]() { + ParallelForSYCL>( + device, indexer); + }); + } else { + utility::LogError( + "Boolean op's output type must be boolean or the " + "same type as the input."); + } + } else if (op_code == UnaryEWOpCode::IsNan || + op_code == UnaryEWOpCode::IsInf || + op_code == UnaryEWOpCode::IsFinite) { + Indexer indexer({src}, dst, DtypePolicy::INPUT_SAME_OUTPUT_BOOL); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + if (op_code == UnaryEWOpCode::IsNan) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsInf) { + ParallelForSYCL>(device, indexer); + } else if (op_code == UnaryEWOpCode::IsFinite) { + ParallelForSYCL>(device, + indexer); + } + }); } else { - dst.CopyFrom(src_conti.To(dst_device)); + Indexer indexer({src}, dst, DtypePolicy::ALL_SAME); + DISPATCH_DTYPE_TO_TEMPLATE(src_dtype, [&]() { + switch (op_code) { + case UnaryEWOpCode::Sqrt: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Sin: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Cos: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Neg: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Exp: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Abs: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Floor: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Ceil: + ParallelForSYCL>(device, + 
indexer); + break; + case UnaryEWOpCode::Round: + ParallelForSYCL>(device, + indexer); + break; + case UnaryEWOpCode::Trunc: + ParallelForSYCL>(device, + indexer); + break; + default: + utility::LogError("Unimplemented op_code for UnaryEWSYCL"); + break; + } + }); } } diff --git a/cpp/open3d/core/linalg/AddMM.cpp b/cpp/open3d/core/linalg/AddMM.cpp index aea908cecbb..45cfd71df7d 100644 --- a/cpp/open3d/core/linalg/AddMM.cpp +++ b/cpp/open3d/core/linalg/AddMM.cpp @@ -98,6 +98,13 @@ void AddMM(const Tensor& A, ldb, lda, ldc, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + AddMMSYCL(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, + ldb, lda, ldc, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { AddMMCPU(B_data, A_data, C_data, n, k, m, alpha, beta, transB, transA, diff --git a/cpp/open3d/core/linalg/AddMM.h b/cpp/open3d/core/linalg/AddMM.h index 1754e430fef..6d26703ae1b 100644 --- a/cpp/open3d/core/linalg/AddMM.h +++ b/cpp/open3d/core/linalg/AddMM.h @@ -20,6 +20,24 @@ namespace core { void AddMM( const Tensor& A, const Tensor& B, Tensor& C, double alpha, double beta); +#ifdef BUILD_SYCL_MODULE +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void AddMMCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/AddMMSYCL.cpp b/cpp/open3d/core/linalg/AddMMSYCL.cpp new file mode 100644 index 00000000000..591a4f26f00 --- /dev/null +++ b/cpp/open3d/core/linalg/AddMMSYCL.cpp @@ -0,0 +1,49 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 
www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { + +void AddMMSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + double alpha, + double beta, + bool gemmTrA, + bool gemmTrB, + int lda, + int ldb, + int ldc, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + blas::column_major::gemm(queue, gemmTrA ? transpose::T : transpose::N, + gemmTrB ? transpose::T : transpose::N, m, n, k, + static_cast(alpha), + static_cast(A_data), lda, + static_cast(B_data), ldb, + static_cast(beta), + static_cast(C_data), ldc) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Inverse.cpp b/cpp/open3d/core/linalg/Inverse.cpp index 51cf7693217..07cb507c1ca 100644 --- a/cpp/open3d/core/linalg/Inverse.cpp +++ b/cpp/open3d/core/linalg/Inverse.cpp @@ -55,6 +55,20 @@ void Inverse(const Tensor &A, Tensor &output) { output = output.T(); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + // LAPACKE supports getri, A is in-place modified as output. 
+ Tensor A_T = A.T().To(device, /*copy=*/true); + void *A_data = A_T.GetDataPtr(); + + InverseSYCL(A_data, ipiv_data, nullptr, n, dtype, device); + output = A_T.T(); +#else + utility::LogError("Unimplemented device."); #endif } else { Dtype ipiv_dtype; diff --git a/cpp/open3d/core/linalg/Inverse.h b/cpp/open3d/core/linalg/Inverse.h index 8809f8f176c..2d3cbb2bbcf 100644 --- a/cpp/open3d/core/linalg/Inverse.h +++ b/cpp/open3d/core/linalg/Inverse.h @@ -17,11 +17,20 @@ void Inverse(const Tensor& A, Tensor& output); void InverseCPU(void* A_data, void* ipiv_data, - void* output_data, + [[maybe_unused]] void* output_data, int64_t n, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void InverseSYCL(void* A_data, + void* ipiv_data, + [[maybe_unused]] void* output_data, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void InverseCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/InverseSYCL.cpp b/cpp/open3d/core/linalg/InverseSYCL.cpp new file mode 100644 index 00000000000..f66dd089cda --- /dev/null +++ b/cpp/open3d/core/linalg/InverseSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/Inverse.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void InverseSYCL(void* A_data, + void* ipiv_data, + void* output_data, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = n; + 
DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = std::max( + lapack::getrf_scratchpad_size(queue, n, n, lda), + lapack::getri_scratchpad_size(queue, n, lda)); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + auto lu_done = + lapack::getrf(queue, n, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size); + lapack::getri(queue, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size, {lu_done}) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LU.cpp b/cpp/open3d/core/linalg/LU.cpp index 5bfafdac425..3a9a4e4788c 100644 --- a/cpp/open3d/core/linalg/LU.cpp +++ b/cpp/open3d/core/linalg/LU.cpp @@ -16,9 +16,9 @@ namespace open3d { namespace core { // Get column permutation tensor from ipiv (swapping index array). -static core::Tensor GetColPermutation(const Tensor& ipiv, - int number_of_indices, - int number_of_rows) { +static Tensor GetColPermutation(const Tensor& ipiv, + int number_of_indices, + int number_of_rows) { Tensor full_ipiv = Tensor::Arange(0, number_of_rows, 1, core::Int32, Device("CPU:0")); Tensor ipiv_cpu = ipiv.To(Device("CPU:0"), core::Int32, /*copy=*/false); @@ -42,14 +42,14 @@ static void OutputToPLU(const Tensor& output, const Tensor& ipiv, const bool permute_l) { int n = output.GetShape()[0]; - core::Device device = output.GetDevice(); + Device device = output.GetDevice(); // Get upper and lower matrix from output matrix. Triul(output, upper, lower, 0); // Get column permutation vector from pivot indices vector. Tensor col_permutation = GetColPermutation(ipiv, ipiv.GetShape()[0], n); // Creating "Permutation Matrix (P in P.A = L.U)". 
- permutation = core::Tensor::Eye(n, output.GetDtype(), device) + permutation = Tensor::Eye(n, output.GetDtype(), device) .IndexGet({col_permutation}); // Calculating P in A = P.L.U. [P.Inverse() = P.T()]. permutation = permutation.T().Contiguous(); @@ -88,15 +88,23 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { // elements as U, (diagonal elements of L are unity), and ipiv array, // which has the pivot indices (for 1 <= i <= min(M,N), row i of the // matrix was interchanged with row IPIV(i). + int64_t ipiv_len = std::min(rows, cols); if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, core::Int32, device); + ipiv = Tensor::Empty({ipiv_len}, core::Int32, device); void* ipiv_data = ipiv.GetDataPtr(); LUCUDA(A_data, ipiv_data, rows, cols, dtype, device); #else utility::LogInfo("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + ipiv = Tensor::Empty({ipiv_len}, core::Int64, device); + void* ipiv_data = ipiv.GetDataPtr(); + LUSYCL(A_data, ipiv_data, rows, cols, dtype, device); +#else + utility::LogInfo("Unimplemented device."); #endif } else { Dtype ipiv_dtype; @@ -107,9 +115,7 @@ void LUIpiv(const Tensor& A, Tensor& ipiv, Tensor& output) { } else { utility::LogError("Unsupported OPEN3D_CPU_LINALG_INT type."); } - - int64_t ipiv_len = std::min(rows, cols); - ipiv = core::Tensor::Empty({ipiv_len}, ipiv_dtype, device); + ipiv = Tensor::Empty({ipiv_len}, ipiv_dtype, device); void* ipiv_data = ipiv.GetDataPtr(); LUCPU(A_data, ipiv_data, rows, cols, dtype, device); } @@ -125,7 +131,7 @@ void LU(const Tensor& A, AssertTensorDtypes(A, {Float32, Float64}); // Get output matrix and ipiv. - core::Tensor ipiv, output; + Tensor ipiv, output; LUIpiv(A, ipiv, output); // Decompose output in P, L, U matrix form. 
diff --git a/cpp/open3d/core/linalg/LUImpl.h b/cpp/open3d/core/linalg/LUImpl.h index 36898fa3de9..fde8f9df385 100644 --- a/cpp/open3d/core/linalg/LUImpl.h +++ b/cpp/open3d/core/linalg/LUImpl.h @@ -21,6 +21,15 @@ void LUCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t rows, + int64_t cols, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void LUCUDA(void* A_data, void* ipiv_data, diff --git a/cpp/open3d/core/linalg/LUSYCL.cpp b/cpp/open3d/core/linalg/LUSYCL.cpp new file mode 100644 index 00000000000..9368270c685 --- /dev/null +++ b/cpp/open3d/core/linalg/LUSYCL.cpp @@ -0,0 +1,42 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LUImpl.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LUSYCL(void* A_data, + void* ipiv_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t lda = m; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. 
+ int64_t scratchpad_size = + lapack::getrf_scratchpad_size(queue, m, n, lda); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::getrf(queue, m, n, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/LeastSquares.cpp b/cpp/open3d/core/linalg/LeastSquares.cpp index 6340331c5ce..46c520215e1 100644 --- a/cpp/open3d/core/linalg/LeastSquares.cpp +++ b/cpp/open3d/core/linalg/LeastSquares.cpp @@ -63,6 +63,12 @@ void LeastSquares(const Tensor &A, const Tensor &B, Tensor &X) { LeastSquaresCUDA(A_data, B_data, m, n, k, dtype, device); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + LeastSquaresSYCL(A_data, B_data, m, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); #endif } else { LeastSquaresCPU(A_data, B_data, m, n, k, dtype, device); diff --git a/cpp/open3d/core/linalg/LeastSquares.h b/cpp/open3d/core/linalg/LeastSquares.h index c2e79935f5d..aedc33a5a39 100644 --- a/cpp/open3d/core/linalg/LeastSquares.h +++ b/cpp/open3d/core/linalg/LeastSquares.h @@ -25,6 +25,16 @@ void LeastSquaresCUDA(void* A_data, const Device& device); #endif +#ifdef BUILD_SYCL_MODULE +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + void LeastSquaresCPU(void* A_data, void* B_data, int64_t m, diff --git a/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp new file mode 100644 index 00000000000..faa9dfb3307 --- /dev/null +++ b/cpp/open3d/core/linalg/LeastSquaresSYCL.cpp @@ -0,0 +1,46 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// 
Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LeastSquares.h" +#include "open3d/core/linalg/LinalgUtils.h" + +namespace open3d { +namespace core { + +void LeastSquaresSYCL(void* A_data, + void* B_data, + int64_t m, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int nrhs = k, lda = m, stride_a = lda * n, ldb = std::max(m, n), + stride_b = ldb * nrhs, batch_size = 1; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + // Use blob to ensure cleanup of scratchpad memory. + int64_t scratchpad_size = lapack::gels_batch_scratchpad_size( + queue, transpose::N, m, n, nrhs, lda, stride_a, ldb, stride_b, + batch_size); + core::Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gels_batch( + queue, transpose::N, m, n, nrhs, static_cast(A_data), + lda, stride_a, static_cast(B_data), ldb, stride_b, + batch_size, static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Matmul.cpp b/cpp/open3d/core/linalg/Matmul.cpp index 1b616c5f335..012a116b736 100644 --- a/cpp/open3d/core/linalg/Matmul.cpp +++ b/cpp/open3d/core/linalg/Matmul.cpp @@ -65,7 +65,13 @@ void Matmul(const Tensor& A, const Tensor& B, Tensor& output) { output = Tensor::Empty({m, n}, dtype, device); void* C_data = output.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + MatmulSYCL(B_data, A_data, C_data, n, k, m, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice 
scoped_device(device); MatmulCUDA(B_data, A_data, C_data, n, k, m, dtype, device); diff --git a/cpp/open3d/core/linalg/Matmul.h b/cpp/open3d/core/linalg/Matmul.h index da29240b0c4..eeaba408591 100644 --- a/cpp/open3d/core/linalg/Matmul.h +++ b/cpp/open3d/core/linalg/Matmul.h @@ -15,6 +15,16 @@ namespace core { /// Computes matrix multiplication C = AB. void Matmul(const Tensor& A, const Tensor& B, Tensor& C); +#ifdef BUILD_SYCL_MODULE +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device); +#endif #ifdef BUILD_CUDA_MODULE void MatmulCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/MatmulSYCL.cpp b/cpp/open3d/core/linalg/MatmulSYCL.cpp new file mode 100644 index 00000000000..be2d6f94ac7 --- /dev/null +++ b/cpp/open3d/core/linalg/MatmulSYCL.cpp @@ -0,0 +1,39 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/AddMM.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/utility/Logging.h" + +namespace open3d { +namespace core { +void MatmulSYCL(void* A_data, + void* B_data, + void* C_data, + int64_t m, + int64_t k, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + scalar_t alpha = 1, beta = 0; + blas::column_major::gemm(queue, transpose::N, transpose::N, m, n, k, + alpha, static_cast(A_data), m, + static_cast(B_data), k, beta, + static_cast(C_data), m) + .wait_and_throw(); + }); +} + +} 
// namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/SVD.cpp b/cpp/open3d/core/linalg/SVD.cpp index 935d014ef4e..657e79f9c57 100644 --- a/cpp/open3d/core/linalg/SVD.cpp +++ b/cpp/open3d/core/linalg/SVD.cpp @@ -40,23 +40,31 @@ void SVD(const Tensor &A, Tensor &U, Tensor &S, Tensor &VT) { U = Tensor::Empty({m, m}, dtype, device); S = Tensor::Empty({n}, dtype, device); VT = Tensor::Empty({n, n}, dtype, device); - Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); void *A_data = A_T.GetDataPtr(); void *U_data = U.GetDataPtr(); void *S_data = S.GetDataPtr(); void *VT_data = VT.GetDataPtr(); - void *superb_data = superb.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + SVDSYCL(A_data, U_data, S_data, VT_data, m, n, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCUDA(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); #else utility::LogError("Unimplemented device."); #endif } else { + Tensor superb = Tensor::Empty({std::min(m, n) - 1}, dtype, device); + void *superb_data = superb.GetDataPtr(); SVDCPU(A_data, U_data, S_data, VT_data, superb_data, m, n, dtype, device); } diff --git a/cpp/open3d/core/linalg/SVD.h b/cpp/open3d/core/linalg/SVD.h index 9e6deefa6c1..51d775d7a9d 100644 --- a/cpp/open3d/core/linalg/SVD.h +++ b/cpp/open3d/core/linalg/SVD.h @@ -16,6 +16,17 @@ namespace core { /// is a min(m, n), VT is an n x n tensor. 
void SVD(const Tensor& A, Tensor& U, Tensor& S, Tensor& VT); +#ifdef BUILD_SYCL_MODULE +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SVDCUDA(const void* A_data, void* U_data, diff --git a/cpp/open3d/core/linalg/SVDSYCL.cpp b/cpp/open3d/core/linalg/SVDSYCL.cpp new file mode 100644 index 00000000000..27abf31728f --- /dev/null +++ b/cpp/open3d/core/linalg/SVDSYCL.cpp @@ -0,0 +1,48 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/SVD.h" + +namespace open3d { +namespace core { + +void SVDSYCL(const void* A_data, + void* U_data, + void* S_data, + void* VT_data, + int64_t m, + int64_t n, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t lda = m, ldvt = n, ldu = m; + int64_t scratchpad_size = lapack::gesvd_scratchpad_size( + queue, jobsvd::vectors, jobsvd::vectors, m, n, lda, ldu, ldvt); + // Use blob to ensure cleanup of scratchpad memory. 
+ Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesvd( + queue, jobsvd::vectors, jobsvd::vectors, m, n, + const_cast(static_cast(A_data)), + lda, static_cast(S_data), + static_cast(U_data), ldu, + static_cast(VT_data), ldvt, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Solve.cpp b/cpp/open3d/core/linalg/Solve.cpp index f1025520fac..ec09e8be3f1 100644 --- a/cpp/open3d/core/linalg/Solve.cpp +++ b/cpp/open3d/core/linalg/Solve.cpp @@ -60,7 +60,16 @@ void Solve(const Tensor &A, const Tensor &B, Tensor &X) { X = B.T().Clone(); void *B_data = X.GetDataPtr(); - if (device.IsCUDA()) { + if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + Tensor ipiv = Tensor::Empty({n}, core::Int64, device); + void *ipiv_data = ipiv.GetDataPtr(); + + SolveSYCL(A_data, B_data, ipiv_data, n, k, dtype, device); +#else + utility::LogError("Unimplemented device."); +#endif + } else if (device.IsCUDA()) { #ifdef BUILD_CUDA_MODULE CUDAScopedDevice scoped_device(device); Tensor ipiv = Tensor::Empty({n}, core::Int32, device); diff --git a/cpp/open3d/core/linalg/Solve.h b/cpp/open3d/core/linalg/Solve.h index 485de7ef0f2..a299c100ca7 100644 --- a/cpp/open3d/core/linalg/Solve.h +++ b/cpp/open3d/core/linalg/Solve.h @@ -23,6 +23,16 @@ void SolveCPU(void* A_data, Dtype dtype, const Device& device); +#ifdef BUILD_SYCL_MODULE +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device); +#endif + #ifdef BUILD_CUDA_MODULE void SolveCUDA(void* A_data, void* B_data, diff --git a/cpp/open3d/core/linalg/SolveSYCL.cpp b/cpp/open3d/core/linalg/SolveSYCL.cpp new file mode 100644 index 00000000000..38d6690ff89 --- /dev/null +++ b/cpp/open3d/core/linalg/SolveSYCL.cpp @@ -0,0 +1,44 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// 
---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include + +#include "oneapi/mkl.hpp" +#include "open3d/core/Blob.h" +#include "open3d/core/SYCLContext.h" +#include "open3d/core/linalg/LinalgUtils.h" +#include "open3d/core/linalg/Solve.h" + +namespace open3d { +namespace core { + +void SolveSYCL(void* A_data, + void* B_data, + void* ipiv_data, + int64_t n, + int64_t k, + Dtype dtype, + const Device& device) { + using namespace oneapi::mkl; + sycl::queue queue = sy::SYCLContext::GetInstance().GetDefaultQueue(device); + int64_t nrhs = k, lda = n, ldb = n; + DISPATCH_LINALG_DTYPE_TO_TEMPLATE(dtype, [&]() { + int64_t scratchpad_size = lapack::gesv_scratchpad_size( + queue, n, nrhs, lda, ldb); + // Use blob to ensure cleanup of scratchpad memory. + Blob scratchpad(scratchpad_size * sizeof(scalar_t), device); + lapack::gesv(queue, n, nrhs, static_cast(A_data), lda, + static_cast(ipiv_data), + static_cast(B_data), ldb, + static_cast(scratchpad.GetDataPtr()), + scratchpad_size) + .wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/linalg/Tri.cpp b/cpp/open3d/core/linalg/Tri.cpp index 77e72b076a2..b15b864aa6a 100644 --- a/cpp/open3d/core/linalg/Tri.cpp +++ b/cpp/open3d/core/linalg/Tri.cpp @@ -42,6 +42,12 @@ void Triu(const Tensor& A, Tensor& output, const int diagonal) { TriuCUDA(A.Contiguous(), output, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriuSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriuCPU(A.Contiguous(), output, diagonal); @@ -58,6 +64,12 @@ void Tril(const Tensor& A, Tensor& output, const int diagonal) { TrilCUDA(A.Contiguous(), output, diagonal); #else 
utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TrilSYCL(A.Contiguous(), output, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TrilCPU(A.Contiguous(), output, diagonal); @@ -75,6 +87,12 @@ void Triul(const Tensor& A, Tensor& upper, Tensor& lower, const int diagonal) { TriulCUDA(A.Contiguous(), upper, lower, diagonal); #else utility::LogError("Unimplemented device."); +#endif + } else if (device.IsSYCL()) { +#ifdef BUILD_SYCL_MODULE + TriulSYCL(A.Contiguous(), upper, lower, diagonal); +#else + utility::LogError("Unimplemented device."); #endif } else { TriulCPU(A.Contiguous(), upper, lower, diagonal); diff --git a/cpp/open3d/core/linalg/TriImpl.h b/cpp/open3d/core/linalg/TriImpl.h index 441d9c69c93..db7b204a87c 100644 --- a/cpp/open3d/core/linalg/TriImpl.h +++ b/cpp/open3d/core/linalg/TriImpl.h @@ -22,6 +22,17 @@ void TriulCPU(const Tensor& A, Tensor& lower, const int diagonal = 0); +#ifdef BUILD_SYCL_MODULE +void TriuSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TrilSYCL(const Tensor& A, Tensor& output, const int diagonal = 0); + +void TriulSYCL(const Tensor& A, + Tensor& upper, + Tensor& lower, + const int diagonal = 0); +#endif + #ifdef BUILD_CUDA_MODULE void TriuCUDA(const Tensor& A, Tensor& output, const int diagonal = 0); diff --git a/cpp/open3d/core/linalg/TriSYCL.cpp b/cpp/open3d/core/linalg/TriSYCL.cpp new file mode 100644 index 00000000000..3d10f99efc3 --- /dev/null +++ b/cpp/open3d/core/linalg/TriSYCL.cpp @@ -0,0 +1,82 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/Dispatch.h" +#include 
"open3d/core/SYCLContext.h" +#include "open3d/core/Tensor.h" +#include "open3d/core/linalg/TriImpl.h" + +namespace open3d { +namespace core { + +void TriuSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) >= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TrilSYCL(const Tensor &A, Tensor &output, const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *output_ptr = static_cast(output.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) <= + diagonal) { + output_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +void TriulSYCL(const Tensor &A, + Tensor &upper, + Tensor &lower, + const int diagonal) { + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(A.GetDevice()); + DISPATCH_DTYPE_TO_TEMPLATE(A.GetDtype(), [&]() { + const scalar_t *A_ptr = static_cast(A.GetDataPtr()); + scalar_t *upper_ptr = static_cast(upper.GetDataPtr()); + scalar_t *lower_ptr = static_cast(lower.GetDataPtr()); + auto rows = static_cast(A.GetShape()[0]), + cols = static_cast(A.GetShape()[1]); + queue.parallel_for({cols, rows}, [=](auto wid) { + const auto wid_1d = wid[1] * cols + 
wid[0]; + if (static_cast(wid[0]) - static_cast(wid[1]) < + diagonal) { + lower_ptr[wid_1d] = A_ptr[wid_1d]; + } else if (static_cast(wid[0]) - + static_cast(wid[1]) > + diagonal) { + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } else { + lower_ptr[wid_1d] = 1; + upper_ptr[wid_1d] = A_ptr[wid_1d]; + } + }).wait_and_throw(); + }); +} + +} // namespace core +} // namespace open3d diff --git a/cpp/open3d/core/nns/NearestNeighborSearch.h b/cpp/open3d/core/nns/NearestNeighborSearch.h index 98898cdd7b2..2996f8a6caa 100644 --- a/cpp/open3d/core/nns/NearestNeighborSearch.h +++ b/cpp/open3d/core/nns/NearestNeighborSearch.h @@ -27,12 +27,14 @@ class NearestNeighborSearch { /// Constructor. /// /// \param dataset_points Dataset points for constructing search index. Must - /// be 2D, with shape {n, d}. + /// be 2D, with shape {n, d}. SYCL tensors are not yet supported. // NearestNeighborSearch(const Tensor &dataset_points) // : dataset_points_(dataset_points){}; NearestNeighborSearch(const Tensor &dataset_points, const Dtype &index_dtype = core::Int32) - : dataset_points_(dataset_points), index_dtype_(index_dtype){}; + : dataset_points_(dataset_points), index_dtype_(index_dtype) { + AssertNotSYCL(dataset_points_); + }; ~NearestNeighborSearch(); NearestNeighborSearch(const NearestNeighborSearch &) = delete; NearestNeighborSearch &operator=(const NearestNeighborSearch &) = delete; diff --git a/cpp/open3d/t/geometry/TriangleMesh.cpp b/cpp/open3d/t/geometry/TriangleMesh.cpp index 17461b7214c..938f7333d49 100644 --- a/cpp/open3d/t/geometry/TriangleMesh.cpp +++ b/cpp/open3d/t/geometry/TriangleMesh.cpp @@ -1447,6 +1447,11 @@ Image TriangleMesh::ProjectImagesToAlbedo( const std::vector &extrinsic_matrices, int tex_size /*=1024*/, bool update_material /*=true*/) { + if (!GetDevice().IsCPU() || !Image::HAVE_IPP) { + utility::LogError( + "ProjectImagesToAlbedo is only supported on x86_64 CPU " + "devices."); + } using core::None; using tk = core::TensorKey; constexpr float EPS = 1e-6; @@ 
-1480,7 +1485,7 @@ Image TriangleMesh::ProjectImagesToAlbedo( tex_size, {"positions"}, 1, 0, false)["positions"]; core::Tensor albedo = core::Tensor::Zeros({tex_size, tex_size, 4}, core::Float32); - albedo.Slice(2, 3, 4).Fill(EPS); // regularize + albedo.Slice(2, 3, 4).Fill(EPS); // regularize weight std::mutex albedo_mutex; RaycastingScene rcs; @@ -1581,11 +1586,11 @@ Image TriangleMesh::ProjectImagesToAlbedo( // C. Interpolate weighted image to weighted texture // albedo[u,v] = image[ i[u,v], j[u,v] ] this_albedo[widx].Fill(0.f); - ipp::Remap(weighted_image[widx], /*{height, width, 4} f32*/ - uv2xy2[0], /* {texsz, texsz} f32*/ - uv2xy2[1], /* {texsz, texsz} f32*/ - this_albedo[widx], /*{texsz, texsz, 4} f32*/ - t::geometry::Image::InterpType::Linear); + IPP_CALL(ipp::Remap, weighted_image[widx], /*{height, width, 4} f32*/ + uv2xy2[0], /* {texsz, texsz} f32*/ + uv2xy2[1], /* {texsz, texsz} f32*/ + this_albedo[widx], /*{texsz, texsz, 4} f32*/ + t::geometry::Image::InterpType::Linear); // Weights can become negative with higher order interpolation std::unique_lock albedo_lock{albedo_mutex}; diff --git a/cpp/open3d/t/geometry/TriangleMesh.h b/cpp/open3d/t/geometry/TriangleMesh.h index b1bd980f8b6..a517a28c7c6 100644 --- a/cpp/open3d/t/geometry/TriangleMesh.h +++ b/cpp/open3d/t/geometry/TriangleMesh.h @@ -1006,6 +1006,8 @@ class TriangleMesh : public Geometry, public DrawableGeometry { /// albedo. For best results, use images captured with exposure and white /// balance lock to reduce the chance of seams in the output texture. /// + /// This function is only supported on the CPU device. + /// /// \param images vector of images. /// \param intrinsic_matrices vector of {3,3} intrinsic matrices describing /// the pinhole camera. 
diff --git a/cpp/open3d/t/geometry/kernel/CMakeLists.txt b/cpp/open3d/t/geometry/kernel/CMakeLists.txt index aa651596b6d..acc3ddc7e33 100644 --- a/cpp/open3d/t/geometry/kernel/CMakeLists.txt +++ b/cpp/open3d/t/geometry/kernel/CMakeLists.txt @@ -1,4 +1,6 @@ open3d_ispc_add_library(tgeometry_kernel OBJECT) +set_target_properties(tgeometry_kernel PROPERTIES CXX_VISIBILITY_PRESET "hidden") + target_sources(tgeometry_kernel PRIVATE Image.cpp diff --git a/cpp/open3d/t/geometry/kernel/IPPImage.cpp b/cpp/open3d/t/geometry/kernel/IPPImage.cpp index 1535673db30..5b6474757f2 100644 --- a/cpp/open3d/t/geometry/kernel/IPPImage.cpp +++ b/cpp/open3d/t/geometry/kernel/IPPImage.cpp @@ -28,7 +28,6 @@ #include "open3d/core/Dispatch.h" #include "open3d/core/Dtype.h" -#include "open3d/core/ParallelFor.h" #include "open3d/core/ShapeUtil.h" #include "open3d/core/Tensor.h" #include "open3d/t/geometry/Image.h" diff --git a/cpp/open3d/t/io/ImageIO.h b/cpp/open3d/t/io/ImageIO.h index 44a1a9d6a66..6b50af60eb5 100644 --- a/cpp/open3d/t/io/ImageIO.h +++ b/cpp/open3d/t/io/ImageIO.h @@ -47,7 +47,7 @@ constexpr int kOpen3DImageIODefaultQuality = -1; /// /// Supported file extensions are png, jpg/jpeg. Data type and number of /// channels depends on the file extension. -/// - PNG: Dtype should be one of core::UInt8, core::UInt16 +/// - PNG: Dtype should be one of core::Bool, core::UInt8, core::UInt16 /// Supported number of channels are 1, 3, and 4. /// - JPG: Dtyppe should be core::UInt8 /// Supported number of channels are 1 and 3. 
diff --git a/cpp/open3d/utility/CompilerInfo.cpp b/cpp/open3d/utility/CompilerInfo.cpp index ecd538ce92d..f55e4d666de 100644 --- a/cpp/open3d/utility/CompilerInfo.cpp +++ b/cpp/open3d/utility/CompilerInfo.cpp @@ -44,12 +44,19 @@ std::string CompilerInfo::CUDACompilerVersion() const { void CompilerInfo::Print() const { #ifdef BUILD_CUDA_MODULE - utility::LogInfo("CompilerInfo: C++ {}, {} {}, {} {}.", CXXStandard(), - CXXCompilerId(), CXXCompilerVersion(), CUDACompilerId(), - CUDACompilerVersion()); + utility::LogInfo("CompilerInfo: C++ {}, {} {}, {} {}, SYCL disabled.", + CXXStandard(), CXXCompilerId(), CXXCompilerVersion(), + CUDACompilerId(), CUDACompilerVersion()); #else - utility::LogInfo("CompilerInfo: C++ {}, {} {}, CUDA disabled.", - CXXStandard(), CXXCompilerId(), CXXCompilerVersion()); +#ifdef BUILD_SYCL_MODULE + utility::LogInfo( + "CompilerInfo: C++ {}, {} {}, CUDA disabled, SYCL enabled.", + CXXStandard(), CXXCompilerId(), CXXCompilerVersion()); +#else + utility::LogInfo( + "CompilerInfo: C++ {}, {} {}, CUDA disabled, SYCL disabled", + CXXStandard(), CXXCompilerId(), CXXCompilerVersion()); +#endif #endif } diff --git a/cpp/pybind/CMakeLists.txt b/cpp/pybind/CMakeLists.txt index 6efae9a17fd..5fdce155f18 100644 --- a/cpp/pybind/CMakeLists.txt +++ b/cpp/pybind/CMakeLists.txt @@ -233,6 +233,7 @@ add_custom_target(python-package -DOPEN3D_ML_ROOT=${OPEN3D_ML_ROOT} -DBUILD_GUI=${BUILD_GUI} -DBUILD_CUDA_MODULE=${BUILD_CUDA_MODULE} + -DBUILD_SYCL_MODULE=${BUILD_SYCL_MODULE} -DGUI_RESOURCE_DIR=${GUI_RESOURCE_DIR} -DPROJECT_EMAIL=${PROJECT_EMAIL} -DPROJECT_HOMEPAGE_URL=${PROJECT_HOMEPAGE_URL} diff --git a/cpp/pybind/core/sycl_utils.cpp b/cpp/pybind/core/sycl_utils.cpp index 9a28dcb5072..dce144e050f 100644 --- a/cpp/pybind/core/sycl_utils.cpp +++ b/cpp/pybind/core/sycl_utils.cpp @@ -33,6 +33,11 @@ void pybind_sycl_utils_definitions(py::module& m) { "Enables the JIT cache for SYCL. 
This sets an environment " "variable and " "will affect the entire process and any child processes."); + + m_sycl.def("get_device_type", sy::GetDeviceType, "device"_a, + "Returns the device type (cpu / gpu / accelerator / custom) of " + "the specified device as a string. Returns empty string if the " + "device is not available."); } } // namespace core diff --git a/cpp/pybind/make_python_package.cmake b/cpp/pybind/make_python_package.cmake index 01e0d5663f5..aa4171414b0 100644 --- a/cpp/pybind/make_python_package.cmake +++ b/cpp/pybind/make_python_package.cmake @@ -77,6 +77,7 @@ if (BUNDLE_OPEN3D_ML) file(RENAME "${PYTHON_PACKAGE_DST_DIR}/open3d/ml3d" "${PYTHON_PACKAGE_DST_DIR}/open3d/_ml3d") endif() +set(requirement_files ${PYTHON_PACKAGE_SRC_DIR}/requirements.txt) # Build Jupyter plugin. if (BUILD_JUPYTER_EXTENSION) if (WIN32 OR UNIX AND NOT LINUX_AARCH64) @@ -113,17 +114,19 @@ if (BUILD_JUPYTER_EXTENSION) "npm install -g yarn.") endif() - # Append requirements_jupyter_install.txt to requirements.txt - # These will be installed when `pip install open3d`. - execute_process(COMMAND ${CMAKE_COMMAND} -E cat - ${PYTHON_PACKAGE_SRC_DIR}/requirements.txt - ${PYTHON_PACKAGE_SRC_DIR}/requirements_jupyter_install.txt - OUTPUT_VARIABLE ALL_REQUIREMENTS - ) - # The double-quote "" is important as it keeps the semicolons. - file(WRITE ${PYTHON_PACKAGE_DST_DIR}/requirements.txt "${ALL_REQUIREMENTS}") + list(APPEND requirement_files + ${PYTHON_PACKAGE_SRC_DIR}/requirements_jupyter_install.txt) +endif() + +if (BUILD_SYCL_MODULE) + list(APPEND requirement_files ${PYTHON_PACKAGE_SRC_DIR}/requirements_sycl.txt) endif() +# These will be installed when the user does `pip install open3d`. 
+ execute_process(COMMAND ${CMAKE_COMMAND} -E cat ${requirement_files} + OUTPUT_FILE ${PYTHON_PACKAGE_DST_DIR}/requirements.txt + ) + if (BUILD_GUI) file(MAKE_DIRECTORY "${PYTHON_PACKAGE_DST_DIR}/open3d/resources/") file(COPY ${GUI_RESOURCE_DIR} diff --git a/cpp/pybind/t/geometry/trianglemesh.cpp b/cpp/pybind/t/geometry/trianglemesh.cpp index e5df69fc48a..b1fed27bc08 100644 --- a/cpp/pybind/t/geometry/trianglemesh.cpp +++ b/cpp/pybind/t/geometry/trianglemesh.cpp @@ -1072,6 +1072,8 @@ blended together in the resulting albedo. For best results, use images captured with exposure and white balance lock to reduce the chance of seams in the output texture. +This function is only supported on the CPU. + Args: images (List[open3d.t.geometry.Image]): List of images. intrinsic_matrices (List[open3d.core.Tensor]): List of (3,3) intrinsic matrices describing diff --git a/cpp/tests/core/Blob.cpp b/cpp/tests/core/Blob.cpp index 85be95d31cc..c55d537acb0 100644 --- a/cpp/tests/core/Blob.cpp +++ b/cpp/tests/core/Blob.cpp @@ -16,10 +16,9 @@ namespace open3d { namespace tests { class BlobPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P( - Blob, - BlobPermuteDevices, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); +INSTANTIATE_TEST_SUITE_P(Blob, + BlobPermuteDevices, + testing::ValuesIn(BlobPermuteDevices::TestCases())); TEST_P(BlobPermuteDevices, BlobConstructor) { core::Device device = GetParam(); diff --git a/cpp/tests/core/CMakeLists.txt b/cpp/tests/core/CMakeLists.txt index c2839eacbae..a1c4826d0fa 100644 --- a/cpp/tests/core/CMakeLists.txt +++ b/cpp/tests/core/CMakeLists.txt @@ -36,3 +36,12 @@ if (BUILD_ISPC_MODULE) ParallelFor.ispc ) endif() + +# TODO: cmake does not currently build this test! 
+# if (BUILD_SYCL_MODULE) +# target_sources(tests PRIVATE +# ParallelForSYCL.cpp +# ) +# set_source_files_properties(ParallelForSYCL.cpp PROPERTIES +# COMPILE_OPTIONS "-fsycl;-fsycl-targets=spir64_gen") +# endif() \ No newline at end of file diff --git a/cpp/tests/core/CoreTest.cpp b/cpp/tests/core/CoreTest.cpp index 8f1486ce4c6..850d486656e 100644 --- a/cpp/tests/core/CoreTest.cpp +++ b/cpp/tests/core/CoreTest.cpp @@ -16,6 +16,13 @@ #include "open3d/core/SizeVector.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os) { + *os << device.ToString(); +} +void PrintTo(const Dtype &dtype, std::ostream *os) { *os << dtype.ToString(); } +} // namespace core + namespace tests { std::vector PermuteDtypesWithBool::TestCases() { @@ -45,7 +52,6 @@ std::vector PermuteDevices::TestCases() { devices.push_back(cuda_devices[0]); devices.push_back(cuda_devices[1]); } - return devices; } @@ -53,8 +59,12 @@ std::vector PermuteDevicesWithSYCL::TestCases() { std::vector devices = PermuteDevices::TestCases(); std::vector sycl_devices = core::Device::GetAvailableSYCLDevices(); - if (!sycl_devices.empty()) { + // Skip the last SYCL device - this is the CPU fallback and support is + // untested. 
+ if (sycl_devices.size() > 1) { devices.push_back(sycl_devices[0]); + // devices.insert(devices.end(), sycl_devices.begin(), + // sycl_devices.end()); } return devices; } @@ -85,7 +95,6 @@ PermuteDevicePairs::TestCases() { } } } - return device_pairs; } @@ -123,7 +132,6 @@ PermuteDevicePairsWithSYCL::TestCases() { } } } - return device_pairs; } diff --git a/cpp/tests/core/CoreTest.h b/cpp/tests/core/CoreTest.h index ab874b8b485..f8c6f5db273 100644 --- a/cpp/tests/core/CoreTest.h +++ b/cpp/tests/core/CoreTest.h @@ -15,6 +15,10 @@ #include "tests/Tests.h" namespace open3d { +namespace core { +void PrintTo(const Device &device, std::ostream *os); +void PrintTo(const Dtype &dtype, std::ostream *os); +} // namespace core namespace tests { class PermuteDtypesWithBool : public testing::TestWithParam { diff --git a/cpp/tests/core/EigenConverter.cpp b/cpp/tests/core/EigenConverter.cpp index 7754657cc13..b3c7d085908 100644 --- a/cpp/tests/core/EigenConverter.cpp +++ b/cpp/tests/core/EigenConverter.cpp @@ -17,10 +17,11 @@ namespace open3d { namespace tests { -class EigenConverterPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(EigenConverter, - EigenConverterPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class EigenConverterPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + EigenConverter, + EigenConverterPermuteDevices, + testing::ValuesIn(EigenConverterPermuteDevices::TestCases())); TEST_P(EigenConverterPermuteDevices, TensorToEigenMatrix) { core::Device device = GetParam(); diff --git a/cpp/tests/core/Indexer.cpp b/cpp/tests/core/Indexer.cpp index 305c08693a0..50a62fee618 100644 --- a/cpp/tests/core/Indexer.cpp +++ b/cpp/tests/core/Indexer.cpp @@ -22,10 +22,10 @@ namespace open3d { namespace tests { -class IndexerPermuteDevices : public PermuteDevices {}; +class IndexerPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Indexer, IndexerPermuteDevices, - 
testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(IndexerPermuteDevices::TestCases())); TEST_P(IndexerPermuteDevices, TensorRef) { core::Device device = GetParam(); diff --git a/cpp/tests/core/Linalg.cpp b/cpp/tests/core/Linalg.cpp index 2c8be12323b..3bd9775e534 100644 --- a/cpp/tests/core/Linalg.cpp +++ b/cpp/tests/core/Linalg.cpp @@ -11,6 +11,7 @@ #include "open3d/core/AdvancedIndexing.h" #include "open3d/core/Dtype.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/SYCLUtils.h" #include "open3d/core/SizeVector.h" #include "open3d/core/Tensor.h" #include "open3d/core/kernel/Kernel.h" @@ -23,10 +24,10 @@ namespace open3d { namespace tests { -class LinalgPermuteDevices : public PermuteDevices {}; +class LinalgPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(Linalg, LinalgPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(LinalgPermuteDevices::TestCases())); TEST_P(LinalgPermuteDevices, Matmul) { const float EPSILON = 1e-8; @@ -482,6 +483,9 @@ TEST_P(LinalgPermuteDevices, LeastSquares) { const float EPSILON = 1e-5; core::Device device = GetParam(); + if (core::sy::GetDeviceType(device) == "cpu") { + GTEST_SKIP() << "MKL unsupported SYCL device."; + } core::Dtype dtype = core::Float32; // Solve test. 
diff --git a/cpp/tests/core/MemoryManager.cpp b/cpp/tests/core/MemoryManager.cpp index 9e3570a5af0..e4017da7e63 100644 --- a/cpp/tests/core/MemoryManager.cpp +++ b/cpp/tests/core/MemoryManager.cpp @@ -17,9 +17,10 @@ namespace open3d { namespace tests { class MemoryManagerPermuteDevices : public PermuteDevicesWithSYCL {}; -INSTANTIATE_TEST_SUITE_P(MemoryManager, - MemoryManagerPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +INSTANTIATE_TEST_SUITE_P( + MemoryManager, + MemoryManagerPermuteDevices, + testing::ValuesIn(MemoryManagerPermuteDevices::TestCases())); class MemoryManagerPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( diff --git a/cpp/tests/core/ParallelForSYCL.cpp b/cpp/tests/core/ParallelForSYCL.cpp new file mode 100644 index 00000000000..ff78804faea --- /dev/null +++ b/cpp/tests/core/ParallelForSYCL.cpp @@ -0,0 +1,65 @@ +// ---------------------------------------------------------------------------- +// - Open3D: www.open3d.org - +// ---------------------------------------------------------------------------- +// Copyright (c) 2018-2024 www.open3d.org +// SPDX-License-Identifier: MIT +// ---------------------------------------------------------------------------- + +#include "open3d/core/ParallelForSYCL.h" + +#include + +#include "open3d/Macro.h" +#include "open3d/core/Dispatch.h" +#include "open3d/core/Dtype.h" +#include "open3d/core/Tensor.h" +#include "tests/Tests.h" +#include "tests/core/CoreTest.h" + +struct TestIndexerFillKernel { + TestFillKernel(const core::Indexer &indexer_, int64_t multiplier_) + : indexer(indexer_), multiplier(multiplier_) {} + void operator()(int64_t idx) { + indexer.GetOutputPtr(0)[idx] = idx * multiplier; + } + +private: + core::Indexer indexer; + int64_t multiplier; +}; + +struct TestPtrFillKernel { + TestFillKernel(int64_t *out_, int64_t multiplier_) + : out(out_), multiplier(multiplier_) {} + void operator()(int64_t idx) { out[idx] = idx * multiplier; } + +private: + 
int64_t *out; + int64_t multiplier; +}; + +TEST(ParallelForSYCL, FunctorSYCL) { + const core::Device device("SYCL:0"); + const size_t N = 10000000; + core::Indexer indexer({}, tensor, DtypePolicy::NONE); + int64_t multiplier = 2; + + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL(device, indexer, + multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } + { + core::Tensor tensor({N, 1}, core::Int64, device); + core::ParallelForSYCL( + device, N, tensor.GetDataPtr(), multiplier); + auto result = tensor.To(core::Device()).GetDataPtr(); + for (int64_t i = 0; i < tensor.NumElements(); ++i) { + ASSERT_EQ(result[i], i * multiplier); + } + } +} \ No newline at end of file diff --git a/cpp/tests/core/Tensor.cpp b/cpp/tests/core/Tensor.cpp index 9f788116be7..25d56d24aee 100644 --- a/cpp/tests/core/Tensor.cpp +++ b/cpp/tests/core/Tensor.cpp @@ -15,6 +15,7 @@ #include "open3d/core/AdvancedIndexing.h" #include "open3d/core/Dtype.h" #include "open3d/core/MemoryManager.h" +#include "open3d/core/SYCLUtils.h" #include "open3d/core/SizeVector.h" #include "open3d/core/kernel/Kernel.h" #include "open3d/utility/FileSystem.h" @@ -28,13 +29,13 @@ namespace tests { class TensorPermuteDevices : public PermuteDevices {}; INSTANTIATE_TEST_SUITE_P(Tensor, TensorPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(TensorPermuteDevices::TestCases())); -class TensorPermuteDevicesWithSYCL : public PermuteDevices {}; +class TensorPermuteDevicesWithSYCL : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P( Tensor, TensorPermuteDevicesWithSYCL, - testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); + testing::ValuesIn(TensorPermuteDevicesWithSYCL::TestCases())); class TensorPermuteDevicePairs : public PermuteDevicePairs {}; INSTANTIATE_TEST_SUITE_P( @@ -95,7 +96,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, 
ConstructorBool) { EXPECT_EQ(t.GetDtype(), dtype); } -TEST_P(TensorPermuteDevices, WithInitValue) { +TEST_P(TensorPermuteDevicesWithSYCL, WithInitValue) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5}; @@ -223,7 +224,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, WithInitValueSizeMismatch) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Arange) { +TEST_P(TensorPermuteDevicesWithSYCL, Arange) { core::Device device = GetParam(); core::Tensor arange; @@ -266,28 +267,28 @@ TEST_P(TensorPermuteDevices, Arange) { std::runtime_error); } -TEST_P(TensorPermuteDevices, Fill) { +TEST_P(TensorPermuteDevicesWithSYCL, Fill) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Fill(1); EXPECT_EQ(t.ToFlatVector(), std::vector({1, 1, 1, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, FillBool) { +TEST_P(TensorPermuteDevicesWithSYCL, FillBool) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, false), {2, 3}, core::Bool, device); t.Fill(true); EXPECT_EQ(t.ToFlatVector(), std::vector(2 * 3, true)); } -TEST_P(TensorPermuteDevices, FillSlice) { +TEST_P(TensorPermuteDevicesWithSYCL, FillSlice) { core::Device device = GetParam(); core::Tensor t(std::vector(2 * 3, 0), {2, 3}, core::Float32, device); t.Slice(1, 0, 3, 2).Fill(1); // t[:, 0:3:2].fill(1) EXPECT_EQ(t.ToFlatVector(), std::vector({1, 0, 1, 1, 0, 1})); } -TEST_P(TensorPermuteDevicePairs, IndexSetFillFancy) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetFillFancy) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -348,7 +349,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBool) { EXPECT_EQ(dst_t.ToFlatVector(), vals); } -TEST_P(TensorPermuteDevices, To) { +TEST_P(TensorPermuteDevicesWithSYCL, To) { core::Device device = GetParam(); core::SizeVector shape{2, 3}; @@ -364,7 +365,7 @@ TEST_P(TensorPermuteDevices, To) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } 
-TEST_P(TensorPermuteDevicePairs, ToDevice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, ToDevice) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -382,7 +383,7 @@ TEST_P(TensorPermuteDevicePairs, ToDevice) { EXPECT_ANY_THROW(src_t.To(core::Device("CUDA:100000"))); } -TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -403,7 +404,7 @@ TEST_P(TensorPermuteDevicePairs, CopyBroadcast) { EXPECT_EQ(dst_t.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Expand) { +TEST_P(TensorPermuteDevicesWithSYCL, Expand) { core::Device device = GetParam(); core::Dtype dtype(core::Float32); @@ -423,7 +424,7 @@ TEST_P(TensorPermuteDevices, Expand) { EXPECT_EQ(dst_t.GetDataPtr(), src_t.GetDataPtr()); } -TEST_P(TensorPermuteDevices, Flatten) { +TEST_P(TensorPermuteDevicesWithSYCL, Flatten) { core::Device device = GetParam(); // Flatten 0-D Tensor. 
@@ -562,7 +563,7 @@ TEST_P(TensorPermuteSizesDefaultStridesAndDevices, DefaultStrides) { EXPECT_EQ(t.GetStrides(), expected_strides); } -TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { +TEST_P(TensorPermuteDevicesWithSYCL, OperatorSquareBrackets) { core::Device device = GetParam(); // Zero dim @@ -623,7 +624,7 @@ TEST_P(TensorPermuteDevices, OperatorSquareBrackets) { EXPECT_EQ(t_1_2_3.GetBlob(), t.GetBlob()); } -TEST_P(TensorPermuteDevices, Item) { +TEST_P(TensorPermuteDevicesWithSYCL, Item) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -645,7 +646,7 @@ TEST_P(TensorPermuteDevices, Item) { EXPECT_EQ(t_1_2_3.Item(), 23.f); } -TEST_P(TensorPermuteDevices, ItemBool) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemBool) { core::Device device = GetParam(); std::vector vals{true, true, false}; @@ -660,7 +661,7 @@ TEST_P(TensorPermuteDevices, ItemBool) { EXPECT_EQ(t[2].Item(), false); } -TEST_P(TensorPermuteDevices, ItemAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, ItemAssign) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, @@ -782,7 +783,7 @@ TEST_P(TensorPermuteDevicePairsWithSYCL, CopyContiguous) { t_1_copy.GetBlob()->GetDataPtr()); // Points to beginning of Blob } -TEST_P(TensorPermuteDevices, Slice) { +TEST_P(TensorPermuteDevicesWithSYCL, Slice) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -839,7 +840,7 @@ TEST_P(TensorPermuteDevices, Slice) { EXPECT_EQ(t_5.ToFlatVector(), std::vector({12, 16})); } -TEST_P(TensorPermuteDevices, GetItem) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItem) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -858,7 +859,7 @@ TEST_P(TensorPermuteDevices, GetItem) { std::vector({12, 14, 16, 18, 20, 22})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = 
core::Tensor::Init( {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, @@ -874,7 +875,7 @@ TEST_P(TensorPermuteDevices, GetItemAdvancedIndexing) { std::vector({0, 1, 1, 2, 3, 5, 8, 13, 21})); } -TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, GetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -894,7 +895,7 @@ TEST_P(TensorPermuteDevices, GetItemAdvancedIndexingMixed) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({13, 17, 14, 18})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexing) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -914,7 +915,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexing) { 16, 17, 18, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { +TEST_P(TensorPermuteDevicesWithSYCL, SetItemAdvancedIndexingMixed) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -937,7 +938,7 @@ TEST_P(TensorPermuteDevices, SetItemAdvancedIndexingMixed) { 16, 200, 400, 19, 20, 21, 22, 23})); } -TEST_P(TensorPermuteDevices, SliceAssign) { +TEST_P(TensorPermuteDevicesWithSYCL, SliceAssign) { core::Device device = GetParam(); core::Tensor dst = core::Tensor::Init( @@ -991,7 +992,7 @@ TEST_P(TensorPermuteDevices, SliceAssign) { 16, 17, 18, 19, 203, 21, 223, 23})); } -TEST_P(TensorPermuteDevices, Append) { +TEST_P(TensorPermuteDevicesWithSYCL, Append) { core::Device device = GetParam(); core::Tensor self, other, output; @@ -1090,7 +1091,7 @@ TEST_P(TensorPermuteDevices, Append) { } } -TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { +TEST_P(TensorPermuteDevicePairsWithSYCL, CopyNonContiguous) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1127,7 +1128,7 @@ TEST_P(TensorPermuteDevicePairs, CopyNonContiguous) { } } -TEST_P(TensorPermuteDevicePairs, IndexGet) { 
+TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1164,7 +1165,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet) { EXPECT_EQ(src_t.GetDtype(), dst_t.GetDtype()); } -TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetNegative) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1188,7 +1189,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetNegative) { EXPECT_EQ(t_1.ToFlatVector(), std::vector({5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGet2DBroadcastedIndex) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1218,7 +1219,8 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndex) { 28, 29, 30, 31, 40, 41, 42, 43})); } -TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, + IndexGet2DBroadcastedIndexSplitBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1249,7 +1251,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGet2DBroadcastedIndexSplitBySlice) { 16, 20, 40, 44, 17, 21, 41, 45})); } -TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetAssignToBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1279,7 +1281,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetAssignToBroadcast) { std::vector({5, 10, 17, 22, 5, 10, 17, 22, 5, 10, 17, 22})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSeparateBySlice) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1303,7 +1305,7 @@ 
TEST_P(TensorPermuteDevicePairs, IndexGetSeparateBySlice) { std::vector({0, 4, 8, 13, 17, 21})); } -TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexGetSliceEnd) { core::Device idx_device; core::Device src_device; std::tie(idx_device, src_device) = GetParam(); @@ -1326,7 +1328,7 @@ TEST_P(TensorPermuteDevicePairs, IndexGetSliceEnd) { std::vector({0, 1, 2, 3, 16, 17, 18, 19})); } -TEST_P(TensorPermuteDevicePairs, IndexSet) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSet) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1405,7 +1407,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSet) { core::Tensor::Init({10, 11}, src_device))); } -TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { +TEST_P(TensorPermuteDevicePairsWithSYCL, IndexSetBroadcast) { core::Device dst_device; core::Device src_device; std::tie(dst_device, src_device) = GetParam(); @@ -1430,7 +1432,7 @@ TEST_P(TensorPermuteDevicePairs, IndexSetBroadcast) { 0, 0, 0, 0, 20, 20, 20, 0, 0, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, IndexAdd_) { +TEST_P(TensorPermuteDevicesWithSYCL, IndexAdd_) { core::Device device = GetParam(); const int tensor_size = 100; @@ -1464,7 +1466,7 @@ TEST_P(TensorPermuteDevices, IndexAdd_) { } } -TEST_P(TensorPermuteDevices, Permute) { +TEST_P(TensorPermuteDevicesWithSYCL, Permute) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1492,7 +1494,7 @@ TEST_P(TensorPermuteDevices, Permute) { 17, 21, 14, 18, 22, 15, 19, 23})); } -TEST_P(TensorPermuteDevices, Transpose) { +TEST_P(TensorPermuteDevicesWithSYCL, Transpose) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( @@ -1512,7 +1514,7 @@ TEST_P(TensorPermuteDevices, Transpose) { EXPECT_THROW(t.Transpose(3, 5), std::runtime_error); } -TEST_P(TensorPermuteDevices, T) { +TEST_P(TensorPermuteDevicesWithSYCL, T) { core::Device device = GetParam(); std::vector vals{0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, @@ -1532,7 +1534,7 @@ TEST_P(TensorPermuteDevices, T) { EXPECT_THROW(t_3d.T(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Det) { +TEST_P(TensorPermuteDevicesWithSYCL, Det) { core::Device device = GetParam(); // Det supports both Float32 and Float64. core::Dtype dtype = core::Float32; @@ -1556,7 +1558,7 @@ TEST_P(TensorPermuteDevices, Det) { EXPECT_ANY_THROW(core::Tensor::Ones({3, 4}, dtype, device).Det()); } -TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { +TEST_P(TensorPermuteDevicesWithSYCL, ShallowCopyConstructor) { core::Device device = GetParam(); core::Tensor t({2, 3}, core::Float32, device); @@ -1579,7 +1581,7 @@ TEST_P(TensorPermuteDevices, ShallowCopyConstructor) { EXPECT_EQ(t.GetDataPtr(), FirstTensorDataPtr({t})); } -TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { +TEST_P(TensorPermuteDevicesWithSYCL, AdvancedIndexing_IsIndexSplittedBySlice) { core::Device device = GetParam(); core::Tensor idx = core::Tensor::Init({1, 2}, device); @@ -1602,7 +1604,7 @@ TEST_P(TensorPermuteDevices, AdvancedIndexing_IsIndexSplittedBySlice) { {idx, slice, slice, idx})); } -TEST_P(TensorPermuteDevices, Add) { +TEST_P(TensorPermuteDevicesWithSYCL, Add) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1612,7 +1614,7 @@ TEST_P(TensorPermuteDevices, Add) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_) { +TEST_P(TensorPermuteDevicesWithSYCL, Add_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1622,7 +1624,7 @@ TEST_P(TensorPermuteDevices, Add_) { std::vector({10, 12, 14, 16, 18, 20})); } -TEST_P(TensorPermuteDevices, Add_BroadcastException) { +TEST_P(TensorPermuteDevicesWithSYCL, Add_BroadcastException) { // A.shape = ( 3, 4) // B.shape = (2, 3, 4) // A += B should throw exception. 
@@ -1642,7 +1644,7 @@ TEST_P(TensorPermuteDevices, Add_BroadcastException) { 20, 22, 24, 26, 28, 30, 32, 34})); } -TEST_P(TensorPermuteDevices, Sub) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1652,7 +1654,7 @@ TEST_P(TensorPermuteDevices, Sub) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Sub_) { +TEST_P(TensorPermuteDevicesWithSYCL, Sub_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{10, 12, 14}, {16, 18, 20}}, device); @@ -1662,7 +1664,7 @@ TEST_P(TensorPermuteDevices, Sub_) { std::vector({10, 11, 12, 13, 14, 15})); } -TEST_P(TensorPermuteDevices, Mul) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1672,7 +1674,7 @@ TEST_P(TensorPermuteDevices, Mul) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Mul_) { +TEST_P(TensorPermuteDevicesWithSYCL, Mul_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1, 2}, {3, 4, 5}}, device); core::Tensor b = @@ -1682,27 +1684,31 @@ TEST_P(TensorPermuteDevices, Mul_) { std::vector({0, 7, 16, 27, 40, 55})); } -TEST_P(TensorPermuteDevices, Div) { +TEST_P(TensorPermuteDevicesWithSYCL, Div) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); core::Tensor c = a / b; - EXPECT_EQ(c.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor c_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(c.AllClose(c_ref)); } -TEST_P(TensorPermuteDevices, Div_) { +TEST_P(TensorPermuteDevicesWithSYCL, Div_) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 7, 16}, {27, 40, 55}}, device); core::Tensor b = 
core::Tensor::Init({{6, 7, 8}, {9, 10, 11}}, device); a /= b; - EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 2, 3, 4, 5})); + core::Tensor a_ref = core::Tensor(std::vector{0, 1, 2, 3, 4, 5}, + {2, 3}, core::Float32, device); + EXPECT_TRUE(a.AllClose(a_ref)); } -TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1774,7 +1780,7 @@ TEST_P(TensorPermuteDevices, ReduceSumKeepDim) { EXPECT_THROW(src.Sum({2, -1}, true), std::runtime_error); // Repeated. } -TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumNotKeepDim) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -1829,7 +1835,7 @@ TEST_P(TensorPermuteDevices, ReduceSumNotKeepDim) { EXPECT_EQ(dst.ToFlatVector(), std::vector({276.f})); } -TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumSpecialShapes) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -1910,22 +1916,23 @@ TEST_P(TensorPermuteDevices, ReduceSumSpecialShapes) { TEST_P(TensorPermuteDevices, ReduceMultipleOutputsSumLargeArray) { core::Device device = GetParam(); - core::SizeVector shape{3, 7, 8234719}; + constexpr int64_t large = 8234719; + core::SizeVector shape{3, 7, large}; int64_t size = shape.NumElements(); std::vector vals(size, 1); core::Tensor src(vals, shape, core::Int32, device); core::Tensor dst; dst = src.Sum({}, false); - EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, 8234719})); - EXPECT_EQ(dst.ToFlatVector(), std::vector(3 * 7 * 8234719, 1)); + EXPECT_EQ(dst.GetShape(), core::SizeVector({3, 7, large})); + EXPECT_EQ(dst.ToFlatVector(), std::vector(3 * 7 * large, 1)); dst = src.Sum({0}, false); - EXPECT_EQ(dst.GetShape(), core::SizeVector({7, 8234719})); - 
EXPECT_EQ(dst.ToFlatVector(), std::vector(7 * 8234719, 3)); + EXPECT_EQ(dst.GetShape(), core::SizeVector({7, large})); + EXPECT_EQ(dst.ToFlatVector(), std::vector(7 * large, 3)); } -TEST_P(TensorPermuteDevices, ReduceSum64bit1D) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSum64bit1D) { core::Device device = GetParam(); // num_bytes = 8 * (2 ^ 28) + 1 = 2 ^ 31 + 1 ~= 2GB // max_offsets = num_bytes - 1 = 2 ^ 31 @@ -2038,7 +2045,7 @@ TEST_P(TensorPermuteDevices, ReduceSum64bit2DCase3) { std::vector(large_dim - 30, 2)); } -TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceSumLargeArray) { core::Device device = GetParam(); std::vector sizes = TensorSizes::TestCases(); @@ -2060,7 +2067,7 @@ TEST_P(TensorPermuteDevices, ReduceSumLargeArray) { } } -TEST_P(TensorPermuteDevices, ReduceProd) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceProd) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2117,7 +2124,7 @@ TEST_P(TensorPermuteDevices, ReduceProd) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMin) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMin) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2170,7 +2177,7 @@ TEST_P(TensorPermuteDevices, ReduceMin) { EXPECT_EQ(dst.ToFlatVector(), std::vector({0.f})); } -TEST_P(TensorPermuteDevices, ReduceMax) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMax) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{{22.f, 23.f, 20.f, 9.f}, {6.f, 14.f, 18.f, 13.f}, @@ -2225,7 +2232,7 @@ TEST_P(TensorPermuteDevices, ReduceMax) { EXPECT_EQ(dst.ToFlatVector(), std::vector({23.f})); } -TEST_P(TensorPermuteDevices, ReduceMaxFloatLimit) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMaxFloatLimit) { // std::numeric_limits should use lowest() instead of min(). 
core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({-2.f, -1.f}, device); @@ -2237,8 +2244,13 @@ TEST_P(TensorPermuteDevices, ReduceMaxFloatLimit) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1})); } -TEST_P(TensorPermuteDevices, ReduceArgMin) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceArgMin) { core::Device device = GetParam(); + if (core::sy::GetDeviceType(device) == "cpu") { + GTEST_SKIP() << "allocateMemSubBuffer() API failed with unknown error " + "on CPU."; + } + core::Tensor src = core::Tensor::Init( {{{22, 23, 20, 9}, {6, 14, 18, 13}, {15, 3, 17, 0}}, {{7, 21, 11, 1}, {4, 2, 10, 19}, {5, 8, 16, 12}}}, @@ -2265,8 +2277,12 @@ TEST_P(TensorPermuteDevices, ReduceArgMin) { std::vector({3, 0, 3, 3, 1, 0})); } -TEST_P(TensorPermuteDevices, ReduceArgMax) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceArgMax) { core::Device device = GetParam(); + if (core::sy::GetDeviceType(device) == "cpu") { + GTEST_SKIP() << "allocateMemSubBuffer() API failed with unknown error " + "on CPU."; + } core::Tensor src = core::Tensor::Init( {{{22, 23, 20, 9}, {6, 14, 18, 13}, {15, 3, 17, 0}}, {{7, 21, 11, 1}, {4, 2, 10, 19}, {5, 8, 16, 12}}}, @@ -2293,7 +2309,7 @@ TEST_P(TensorPermuteDevices, ReduceArgMax) { std::vector({1, 2, 2, 1, 3, 2})); } -TEST_P(TensorPermuteDevices, Sqrt) { +TEST_P(TensorPermuteDevicesWithSYCL, Sqrt) { core::Device device = GetParam(); core::Tensor src = core::Tensor::Init({{0, 1, 4}, {9, 16, 25}}, device); @@ -2323,7 +2339,7 @@ TEST_P(TensorPermuteDevices, Sqrt) { std::vector({0, 1, 2, 3, 4, 5})); } -TEST_P(TensorPermuteDevices, Sin) { +TEST_P(TensorPermuteDevicesWithSYCL, Sin) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2346,7 +2362,7 @@ TEST_P(TensorPermuteDevices, Sin) { EXPECT_THROW(src.Sin(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Cos) { +TEST_P(TensorPermuteDevicesWithSYCL, Cos) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2369,7 +2385,7 @@ 
TEST_P(TensorPermuteDevices, Cos) { EXPECT_THROW(src.Cos(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Neg) { +TEST_P(TensorPermuteDevicesWithSYCL, Neg) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2388,7 +2404,7 @@ TEST_P(TensorPermuteDevices, Neg) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, UnaryMinus) { +TEST_P(TensorPermuteDevicesWithSYCL, UnaryMinus) { core::Device device = GetParam(); std::vector dst_vals{2, 1, 0, -1, -2, -3}; @@ -2403,7 +2419,7 @@ TEST_P(TensorPermuteDevices, UnaryMinus) { EXPECT_EQ(dst.ToFlatVector(), std::vector({1, 0, -2})); } -TEST_P(TensorPermuteDevices, Exp) { +TEST_P(TensorPermuteDevicesWithSYCL, Exp) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2426,7 +2442,7 @@ TEST_P(TensorPermuteDevices, Exp) { EXPECT_THROW(src.Exp(), std::runtime_error); } -TEST_P(TensorPermuteDevices, Abs) { +TEST_P(TensorPermuteDevicesWithSYCL, Abs) { core::Device device = GetParam(); std::vector src_vals{-2, -1, 0, 1, 2, 3}; @@ -2444,7 +2460,7 @@ TEST_P(TensorPermuteDevices, Abs) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsNan) { +TEST_P(TensorPermuteDevicesWithSYCL, IsNan) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2458,7 +2474,7 @@ TEST_P(TensorPermuteDevices, IsNan) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsInf) { +TEST_P(TensorPermuteDevicesWithSYCL, IsInf) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2472,7 +2488,7 @@ TEST_P(TensorPermuteDevices, IsInf) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, IsFinite) { +TEST_P(TensorPermuteDevicesWithSYCL, IsFinite) { core::Device device = GetParam(); std::vector src_vals{-INFINITY, NAN, 0, NAN, 2, INFINITY}; @@ -2486,7 +2502,7 @@ TEST_P(TensorPermuteDevices, IsFinite) { 
EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Floor) { +TEST_P(TensorPermuteDevicesWithSYCL, Floor) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2500,7 +2516,7 @@ TEST_P(TensorPermuteDevices, Floor) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Ceil) { +TEST_P(TensorPermuteDevicesWithSYCL, Ceil) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2514,7 +2530,7 @@ TEST_P(TensorPermuteDevices, Ceil) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Round) { +TEST_P(TensorPermuteDevicesWithSYCL, Round) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2528,7 +2544,7 @@ TEST_P(TensorPermuteDevices, Round) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, Trunc) { +TEST_P(TensorPermuteDevicesWithSYCL, Trunc) { core::Device device = GetParam(); std::vector src_vals{-2.4, -1.6, 0, 1.4, 2.6, 3.5}; @@ -2542,7 +2558,7 @@ TEST_P(TensorPermuteDevices, Trunc) { EXPECT_EQ(dst.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNot) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNot) { core::Device device = GetParam(); std::vector src_vals{true, false, true, false}; @@ -2560,7 +2576,7 @@ TEST_P(TensorPermuteDevices, LogicalNot) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalNotFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalNotFloat) { core::Device device = GetParam(); std::vector src_vals{0, -1, 1, 2}; @@ -2584,7 +2600,7 @@ TEST_P(TensorPermuteDevices, LogicalNotFloat) { EXPECT_EQ(src.ToFlatVector(), dst_vals); } -TEST_P(TensorPermuteDevices, LogicalAnd) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAnd) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2603,7 +2619,7 @@ TEST_P(TensorPermuteDevices, LogicalAnd) { 
std::vector({true, false, false, false})); } -TEST_P(TensorPermuteDevices, LogicalAndFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalAndFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, 0}, {0, 0}}, device); @@ -2616,7 +2632,7 @@ TEST_P(TensorPermuteDevices, LogicalAndFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, LogicalOr) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOr) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2635,7 +2651,7 @@ TEST_P(TensorPermuteDevices, LogicalOr) { std::vector({true, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalOrFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalOrFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2648,7 +2664,7 @@ TEST_P(TensorPermuteDevices, LogicalOrFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, LogicalXor) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXor) { core::Device device = GetParam(); core::Tensor a(std::vector({true, false, true, false}), {2, 2}, core::Bool, device); @@ -2664,7 +2680,7 @@ TEST_P(TensorPermuteDevices, LogicalXor) { std::vector({false, true, true, false})); } -TEST_P(TensorPermuteDevices, LogicalXorFloat) { +TEST_P(TensorPermuteDevicesWithSYCL, LogicalXorFloat) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{-1, 0}, {1, 0}}, device); core::Tensor b = core::Tensor::Init({{1, -1}, {0, 0}}, device); @@ -2677,7 +2693,7 @@ TEST_P(TensorPermuteDevices, LogicalXorFloat) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 0})); } -TEST_P(TensorPermuteDevices, Gt) { +TEST_P(TensorPermuteDevicesWithSYCL, Gt) { core::Device device = GetParam(); core::Tensor 
a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2693,7 +2709,7 @@ TEST_P(TensorPermuteDevices, Gt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Lt) { +TEST_P(TensorPermuteDevicesWithSYCL, Lt) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2709,7 +2725,7 @@ TEST_P(TensorPermuteDevices, Lt) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Ge) { +TEST_P(TensorPermuteDevicesWithSYCL, Ge) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2725,7 +2741,7 @@ TEST_P(TensorPermuteDevices, Ge) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 1, 0, 0})); } -TEST_P(TensorPermuteDevices, Le) { +TEST_P(TensorPermuteDevicesWithSYCL, Le) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2741,7 +2757,7 @@ TEST_P(TensorPermuteDevices, Le) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 1, 1})); } -TEST_P(TensorPermuteDevices, Eq) { +TEST_P(TensorPermuteDevicesWithSYCL, Eq) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); core::Tensor b = core::Tensor::Init({{0, 0}, {0, 2}}, device); @@ -2757,7 +2773,7 @@ TEST_P(TensorPermuteDevices, Eq) { EXPECT_EQ(a.ToFlatVector(), std::vector({1, 0, 0, 0})); } -TEST_P(TensorPermuteDevices, Ne) { +TEST_P(TensorPermuteDevicesWithSYCL, Ne) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Init({{0, 1}, {-1, 1}}, device); @@ -2774,7 +2790,7 @@ TEST_P(TensorPermuteDevices, Ne) { EXPECT_EQ(a.ToFlatVector(), std::vector({0, 1, 1, 1})); } -TEST_P(TensorPermuteDevices, BooleanIndex) { 
+TEST_P(TensorPermuteDevicesWithSYCL, BooleanIndex) { core::Device device = GetParam(); // a[a < 0] = 0 @@ -2798,7 +2814,7 @@ TEST_P(TensorPermuteDevices, BooleanIndex) { EXPECT_EQ(y.GetDtype(), core::Float32); } -TEST_P(TensorPermuteDevices, NonZeroNumpy) { +TEST_P(TensorPermuteDevicesWithSYCL, NonZeroNumpy) { core::Device device = GetParam(); core::Tensor a = @@ -2812,7 +2828,7 @@ TEST_P(TensorPermuteDevices, NonZeroNumpy) { EXPECT_EQ(results[1].GetShape(), core::SizeVector{3}); } -TEST_P(TensorPermuteDevices, All) { +TEST_P(TensorPermuteDevicesWithSYCL, All) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2843,7 +2859,7 @@ TEST_P(TensorPermuteDevices, All) { EXPECT_ANY_THROW(t.All(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, Any) { +TEST_P(TensorPermuteDevicesWithSYCL, Any) { core::Device device = GetParam(); core::Tensor t = core::Tensor::Init( {{false, true}, {true, false}, {true, false}, {true, true}}, @@ -2875,7 +2891,7 @@ TEST_P(TensorPermuteDevices, Any) { EXPECT_ANY_THROW(t.Any(core::SizeVector({2}))); } -TEST_P(TensorPermuteDevices, CreationEmpty) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationEmpty) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Empty({}, core::Float32, device); @@ -2899,7 +2915,7 @@ TEST_P(TensorPermuteDevices, CreationEmpty) { EXPECT_EQ(a.NumElements(), 6); } -TEST_P(TensorPermuteDevices, CreationFull) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationFull) { core::Device device = GetParam(); const float fill_value = 100; @@ -2934,7 +2950,7 @@ TEST_P(TensorPermuteDevices, CreationFull) { std::vector(a.NumElements(), fill_value)); } -TEST_P(TensorPermuteDevices, CreationZeros) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationZeros) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Zeros({2, 3}, core::Float32, device); @@ -2943,7 +2959,7 @@ TEST_P(TensorPermuteDevices, CreationZeros) { 
EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 0)); } -TEST_P(TensorPermuteDevices, CreationOnes) { +TEST_P(TensorPermuteDevicesWithSYCL, CreationOnes) { core::Device device = GetParam(); core::Tensor a = core::Tensor::Ones({2, 3}, core::Float32, device); @@ -2952,7 +2968,7 @@ TEST_P(TensorPermuteDevices, CreationOnes) { EXPECT_EQ(a.ToFlatVector(), std::vector(a.NumElements(), 1)); } -TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { +TEST_P(TensorPermuteDevicesWithSYCL, ScalarOperatorOverload) { core::Device device = GetParam(); core::Tensor a; core::Tensor b; @@ -3038,7 +3054,7 @@ TEST_P(TensorPermuteDevices, ScalarOperatorOverload) { EXPECT_EQ(a.ToFlatVector(), std::vector({5, 5})); } -TEST_P(TensorPermuteDevices, ReduceMean) { +TEST_P(TensorPermuteDevicesWithSYCL, ReduceMean) { core::Device device = GetParam(); core::Tensor src; core::Tensor dst; @@ -3129,8 +3145,9 @@ TEST_P(TensorPermuteDevices, ReduceMean) { EXPECT_TRUE(std::isnan(dst.ToFlatVector()[0])); } -TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { +TEST_P(TensorPermuteDevicesWithSYCL, ToDLPackFromDLPack) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor src_t = core::Tensor::Init( {{{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, {{12, 13, 14, 15}, {16, 17, 18, 19}, {20, 21, 22, 23}}}, @@ -3162,7 +3179,7 @@ TEST_P(TensorPermuteDevices, ToDLPackFromDLPack) { std::vector({12, 14, 20, 22})); } -TEST_P(TensorPermuteDevices, IsSame) { +TEST_P(TensorPermuteDevicesWithSYCL, IsSame) { core::Device device = GetParam(); // "Shallow" copy. 
@@ -3206,7 +3223,7 @@ TEST_P(TensorPermuteDevices, IsSame) { EXPECT_TRUE(vec[0].IsSame(vec[1])); } -TEST_P(TensorPermuteDevices, RValueScalar) { +TEST_P(TensorPermuteDevicesWithSYCL, RValueScalar) { const core::Device &device = GetParam(); core::Tensor t, t_ref; @@ -3271,7 +3288,7 @@ TEST_P(TensorPermuteDevices, RValueScalar) { EXPECT_TRUE(t.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip) { core::Device device = GetParam(); core::Tensor t, t_clip, t_ref; @@ -3324,7 +3341,7 @@ TEST_P(TensorPermuteDevices, Clip) { EXPECT_TRUE(t_clip.AllClose(t_ref)); } -TEST_P(TensorPermuteDevices, Clip_) { +TEST_P(TensorPermuteDevicesWithSYCL, Clip_) { core::Device device = GetParam(); core::Tensor t, t_ref; @@ -3414,7 +3431,7 @@ TEST_P(TensorPermuteDevicePairs, AllEqual) { EXPECT_FALSE(src.AllEqual(dst)); } -TEST_P(TensorPermuteDevices, Iterator) { +TEST_P(TensorPermuteDevicesWithSYCL, Iterator) { core::Device device = GetParam(); core::Tensor t; @@ -3492,7 +3509,7 @@ TEST_P(TensorPermuteDevices, Iterator) { } } -TEST_P(TensorPermuteDevices, ConstIterator) { +TEST_P(TensorPermuteDevicesWithSYCL, ConstIterator) { core::Device device = GetParam(); core::Tensor t; @@ -3565,7 +3582,7 @@ TEST_P(TensorPermuteDevices, ConstIterator) { } } -TEST_P(TensorPermuteDevices, TakeOwnership) { +TEST_P(TensorPermuteDevicesWithSYCL, TakeOwnership) { core::Device device = GetParam(); if (!device.IsCPU()) { GTEST_SKIP(); diff --git a/cpp/tests/core/TensorCheck.cpp b/cpp/tests/core/TensorCheck.cpp index 8b789b481b6..14fb471d874 100644 --- a/cpp/tests/core/TensorCheck.cpp +++ b/cpp/tests/core/TensorCheck.cpp @@ -14,10 +14,11 @@ namespace open3d { namespace tests { -class TensorCheckPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Tensor, - TensorCheckPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorCheckPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Tensor, + 
TensorCheckPermuteDevices, + testing::ValuesIn(TensorCheckPermuteDevices::TestCases())); TEST_P(TensorCheckPermuteDevices, AssertTensorDtype) { core::Device device = GetParam(); diff --git a/cpp/tests/core/TensorFunction.cpp b/cpp/tests/core/TensorFunction.cpp index 0d25b941dfd..17fab12edc9 100644 --- a/cpp/tests/core/TensorFunction.cpp +++ b/cpp/tests/core/TensorFunction.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorFunctionPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(Tensor, - TensorFunctionPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorFunctionPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + Tensor, + TensorFunctionPermuteDevices, + testing::ValuesIn(TensorFunctionPermuteDevices::TestCases())); TEST_P(TensorFunctionPermuteDevices, Concatenate) { core::Device device = GetParam(); @@ -105,7 +106,7 @@ TEST_P(TensorFunctionPermuteDevices, Concatenate) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. EXPECT_ANY_THROW(core::Concatenate({a, b.To(core::Float64), c})); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW( core::Concatenate({a, b.To(core::Device("CPU:0")), c})); } @@ -205,7 +206,7 @@ TEST_P(TensorFunctionPermuteDevices, Append) { // Taking the above case of [1, 2] to [2, 2] with different dtype and // device. 
EXPECT_ANY_THROW(core::Append(self, other.To(core::Float64))); - if (device.IsCUDA()) { + if (!device.IsCPU()) { EXPECT_ANY_THROW(core::Append(self, other.To(core::Device("CPU:0")))); } diff --git a/cpp/tests/core/TensorList.cpp b/cpp/tests/core/TensorList.cpp index 39091f3d386..67d819a8c9a 100644 --- a/cpp/tests/core/TensorList.cpp +++ b/cpp/tests/core/TensorList.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorListPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorList, - TensorListPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorListPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorList, + TensorListPermuteDevices, + testing::ValuesIn(TensorListPermuteDevices::TestCases())); TEST_P(TensorListPermuteDevices, EmptyConstructor) { core::Device device = GetParam(); diff --git a/cpp/tests/core/TensorObject.cpp b/cpp/tests/core/TensorObject.cpp index 0ff294dbc88..62c537efabf 100644 --- a/cpp/tests/core/TensorObject.cpp +++ b/cpp/tests/core/TensorObject.cpp @@ -22,12 +22,13 @@ namespace open3d { namespace tests { -class TensorObjectPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorObject, - TensorObjectPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorObjectPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorObject, + TensorObjectPermuteDevices, + testing::ValuesIn(TensorObjectPermuteDevices::TestCases())); -class TensorObjectPermuteDevicePairs : public PermuteDevicePairs {}; +class TensorObjectPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( TensorObject, TensorObjectPermuteDevicePairs, diff --git a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp index 2139b40263c..e62f8b89a68 100644 --- a/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp +++ 
b/cpp/tests/t/geometry/AxisAlignedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(AxisAlignedBoundingBox, - AxisAlignedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class AxisAlignedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + AxisAlignedBoundingBox, + AxisAlignedBoundingBoxPermuteDevices, + testing::ValuesIn(AxisAlignedBoundingBoxPermuteDevices::TestCases())); -class AxisAlignedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class AxisAlignedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( AxisAlignedBoundingBox, AxisAlignedBoundingBoxPermuteDevicePairs, @@ -276,6 +278,7 @@ TEST_P(AxisAlignedBoundingBoxPermuteDevices, GetBoxPoints) { TEST_P(AxisAlignedBoundingBoxPermuteDevices, GetPointIndicesWithinBoundingBox) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor min_bound = core::Tensor::Init({-1, -1, -1}, device); core::Tensor max_bound = core::Tensor::Init({1, 1, 1}, device); diff --git a/cpp/tests/t/geometry/LineSet.cpp b/cpp/tests/t/geometry/LineSet.cpp index 30c86eaa78f..272761e3169 100644 --- a/cpp/tests/t/geometry/LineSet.cpp +++ b/cpp/tests/t/geometry/LineSet.cpp @@ -16,12 +16,12 @@ namespace open3d { namespace tests { -class LineSetPermuteDevices : public PermuteDevices {}; +class LineSetPermuteDevices : public PermuteDevicesWithSYCL {}; INSTANTIATE_TEST_SUITE_P(LineSet, LineSetPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); + testing::ValuesIn(LineSetPermuteDevices::TestCases())); -class LineSetPermuteDevicePairs : public PermuteDevicePairs {}; +class LineSetPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( LineSet, LineSetPermuteDevicePairs, @@ -277,6 +277,7 @@ 
TEST_P(LineSetPermuteDevices, GetMinBound_GetMaxBound_GetCenter) { TEST_P(LineSetPermuteDevices, Transform) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::LineSet lineset(device); core::Tensor transformation = core::Tensor::Init( @@ -333,6 +334,7 @@ TEST_P(LineSetPermuteDevices, Scale) { TEST_P(LineSetPermuteDevices, Rotate) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::LineSet lineset(device); core::Tensor rotation = core::Tensor::Init( diff --git a/cpp/tests/t/geometry/OrientedBoundingBox.cpp b/cpp/tests/t/geometry/OrientedBoundingBox.cpp index 5a730f4cfb2..d2bda07db22 100644 --- a/cpp/tests/t/geometry/OrientedBoundingBox.cpp +++ b/cpp/tests/t/geometry/OrientedBoundingBox.cpp @@ -19,12 +19,14 @@ namespace open3d { namespace tests { -class OrientedBoundingBoxPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(OrientedBoundingBox, - OrientedBoundingBoxPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class OrientedBoundingBoxPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + OrientedBoundingBox, + OrientedBoundingBoxPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class OrientedBoundingBoxPermuteDevicePairs : public PermuteDevicePairs {}; +class OrientedBoundingBoxPermuteDevicePairs + : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( OrientedBoundingBox, OrientedBoundingBoxPermuteDevicePairs, @@ -248,6 +250,7 @@ TEST_P(OrientedBoundingBoxPermuteDevices, GetBoxPoints) { TEST_P(OrientedBoundingBoxPermuteDevices, GetPointIndicesWithinBoundingBox) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor center = core::Tensor::Init({0.5, 0.5, 0.5}, device); core::Tensor rotation = core::Tensor::Eye(3, core::Float32, device); diff --git a/cpp/tests/t/geometry/PointCloud.cpp 
b/cpp/tests/t/geometry/PointCloud.cpp index 2a74c9c75c1..342aa40eb32 100644 --- a/cpp/tests/t/geometry/PointCloud.cpp +++ b/cpp/tests/t/geometry/PointCloud.cpp @@ -24,12 +24,13 @@ namespace open3d { namespace tests { -class PointCloudPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(PointCloud, - PointCloudPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class PointCloudPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + PointCloud, + PointCloudPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); -class PointCloudPermuteDevicePairs : public PermuteDevicePairs {}; +class PointCloudPermuteDevicePairs : public PermuteDevicePairsWithSYCL {}; INSTANTIATE_TEST_SUITE_P( PointCloud, PointCloudPermuteDevicePairs, @@ -170,6 +171,8 @@ TEST_P(PointCloudPermuteDevices, Copy) { TEST_P(PointCloudPermuteDevices, Transform) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + core::Dtype dtype = core::Float32; t::geometry::PointCloud pcd(device); core::Tensor transformation( @@ -225,6 +228,7 @@ TEST_P(PointCloudPermuteDevices, Scale) { TEST_P(PointCloudPermuteDevices, Rotate) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Dtype dtype = core::Float32; t::geometry::PointCloud pcd(device); core::Tensor rotation(std::vector{1, 1, 0, 0, 1, 1, 0, 1, 0}, {3, 3}, @@ -245,6 +249,7 @@ TEST_P(PointCloudPermuteDevices, Rotate) { TEST_P(PointCloudPermuteDevices, NormalizeNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init({{0, 0, 0}, {0, 0, 1}, @@ -278,6 +283,7 @@ TEST_P(PointCloudPermuteDevices, NormalizeNormals) { TEST_P(PointCloudPermuteDevices, EstimateNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init({{0, 0, 0}, {0, 0, 1}, 
@@ -318,6 +324,7 @@ TEST_P(PointCloudPermuteDevices, EstimateNormals) { TEST_P(PointCloudPermuteDevices, OrientNormalsToAlignWithDirection) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init({{0, 0, 0}, {0, 0, 1}, @@ -357,6 +364,7 @@ TEST_P(PointCloudPermuteDevices, OrientNormalsToAlignWithDirection) { TEST_P(PointCloudPermuteDevices, OrientNormalsTowardsCameraLocation) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; core::Tensor points = core::Tensor::Init( {{0, 0, 0}, {0, 1, 0}, {1, 0, 0}, {1, 1, 0}}, device); @@ -644,6 +652,7 @@ TEST_P(PointCloudPermuteDevices, CreateFromRGBDImage) { using ::testing::UnorderedElementsAreArray; core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; float depth_scale = 1000.f, depth_max = 3.f; int stride = 1; core::Tensor im_depth = @@ -691,6 +700,7 @@ TEST_P(PointCloudPermuteDevices, CreateFromRGBDImage) { TEST_P(PointCloudPermuteDevices, CreateFromRGBDOrDepthImageWithNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; if (!t::geometry::Image::HAVE_IPP && device.GetType() == @@ -891,6 +901,7 @@ TEST_P(PointCloudPermuteDevices, SelectByIndex) { TEST_P(PointCloudPermuteDevices, VoxelDownSample) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; // Value test t::geometry::PointCloud pcd_small( @@ -969,6 +980,7 @@ TEST_P(PointCloudPermuteDevices, FarthestPointDownSample) { TEST_P(PointCloudPermuteDevices, RemoveRadiusOutliers) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; const t::geometry::PointCloud pcd_small( core::Tensor::Init({{1.0, 1.0, 1.0}, @@ -994,6 +1006,7 @@ TEST_P(PointCloudPermuteDevices, RemoveRadiusOutliers) { TEST_P(PointCloudPermuteDevices, RemoveStatisticalOutliers) { core::Device device = GetParam(); + if 
(device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; data::PCDPointCloud sample_pcd_data; geometry::PointCloud pcd_legacy; @@ -1012,6 +1025,7 @@ TEST_P(PointCloudPermuteDevices, RemoveStatisticalOutliers) { TEST_P(PointCloudPermuteDevices, RemoveDuplicatedPoints) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; const t::geometry::PointCloud pcd_small( core::Tensor::Init({{1.0, 1.0, 1.0}, diff --git a/cpp/tests/t/geometry/TensorMap.cpp b/cpp/tests/t/geometry/TensorMap.cpp index edf80230073..9ae1a37e300 100644 --- a/cpp/tests/t/geometry/TensorMap.cpp +++ b/cpp/tests/t/geometry/TensorMap.cpp @@ -15,10 +15,11 @@ namespace open3d { namespace tests { -class TensorMapPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TensorMap, - TensorMapPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TensorMapPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TensorMap, + TensorMapPermuteDevices, + testing::ValuesIn(PermuteDevicesWithSYCL::TestCases())); TEST_P(TensorMapPermuteDevices, Constructor) { core::Dtype dtype = core::Float32; diff --git a/cpp/tests/t/geometry/TriangleMesh.cpp b/cpp/tests/t/geometry/TriangleMesh.cpp index 9fd3837df84..32d53073ded 100644 --- a/cpp/tests/t/geometry/TriangleMesh.cpp +++ b/cpp/tests/t/geometry/TriangleMesh.cpp @@ -26,10 +26,11 @@ namespace open3d { namespace tests { -class TriangleMeshPermuteDevices : public PermuteDevices {}; -INSTANTIATE_TEST_SUITE_P(TriangleMesh, - TriangleMeshPermuteDevices, - testing::ValuesIn(PermuteDevices::TestCases())); +class TriangleMeshPermuteDevices : public PermuteDevicesWithSYCL {}; +INSTANTIATE_TEST_SUITE_P( + TriangleMesh, + TriangleMeshPermuteDevices, + testing::ValuesIn(TriangleMeshPermuteDevices::TestCases())); TEST_P(TriangleMeshPermuteDevices, DefaultConstructor) { t::geometry::TriangleMesh mesh; @@ -252,6 +253,7 @@ TEST_P(TriangleMeshPermuteDevices, Has) { TEST_P(TriangleMeshPermuteDevices, 
Transform) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::TriangleMesh mesh(device); core::Tensor transformation = core::Tensor::Init( @@ -318,6 +320,7 @@ TEST_P(TriangleMeshPermuteDevices, Scale) { TEST_P(TriangleMeshPermuteDevices, Rotate) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; t::geometry::TriangleMesh mesh(device); core::Tensor rotation = core::Tensor::Init( @@ -338,6 +341,7 @@ TEST_P(TriangleMeshPermuteDevices, Rotate) { TEST_P(TriangleMeshPermuteDevices, NormalizeNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; std::shared_ptr mesh = open3d::geometry::TriangleMesh::CreateSphere(1.0, 3); @@ -356,6 +360,7 @@ TEST_P(TriangleMeshPermuteDevices, NormalizeNormals) { TEST_P(TriangleMeshPermuteDevices, ComputeTriangleNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; std::shared_ptr mesh = open3d::geometry::TriangleMesh::CreateSphere(1.0, 3); @@ -371,6 +376,7 @@ TEST_P(TriangleMeshPermuteDevices, ComputeTriangleNormals) { TEST_P(TriangleMeshPermuteDevices, ComputeVertexNormals) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; std::shared_ptr mesh = open3d::geometry::TriangleMesh::CreateSphere(1.0, 3); @@ -1223,6 +1229,8 @@ TEST_P(TriangleMeshPermuteDevices, SelectByIndex) { TEST_P(TriangleMeshPermuteDevices, RemoveUnreferencedVertices) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + t::geometry::TriangleMesh mesh_empty{device}; // check completely empty mesh @@ -1353,9 +1361,11 @@ TEST_P(TriangleMeshPermuteDevices, RemoveUnreferencedVertices) { TEST_P(TriangleMeshPermuteDevices, ProjectImagesToAlbedo) { using namespace t::geometry; + using ::testing::AnyOf; using ::testing::ElementsAre; using ::testing::FloatEq; core::Device device = GetParam(); + if 
(!device.IsCPU() || !Image::HAVE_IPP) GTEST_SKIP() << "Not Implemented!"; TriangleMesh sphere = TriangleMesh::FromLegacy(*geometry::TriangleMesh::CreateSphere( 1.0, 20, /*create_uv_map=*/true)); @@ -1397,11 +1407,19 @@ TEST_P(TriangleMeshPermuteDevices, ProjectImagesToAlbedo) { .To(core::Float32) .Mean({0, 1}) .ToFlatVector(), - ElementsAre(FloatEq(87.8693), FloatEq(67.538), FloatEq(64.31))); + AnyOf(ElementsAre(FloatEq(87.8693), FloatEq(67.538), + FloatEq(64.31)), // macOS + ElementsAre(FloatEq(87.8758), + FloatEq(67.5518), // Linux / Windows + FloatEq(64.3254))) + + ); } // namespace tests TEST_P(TriangleMeshPermuteDevices, ComputeTriangleAreas) { core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + t::geometry::TriangleMesh mesh_empty; EXPECT_NO_THROW(mesh_empty.ComputeTriangleAreas()); @@ -1424,6 +1442,8 @@ TEST_P(TriangleMeshPermuteDevices, ComputeTriangleAreas) { TEST_P(TriangleMeshPermuteDevices, RemoveNonManifoldEdges) { using ::testing::UnorderedElementsAreArray; core::Device device = GetParam(); + if (device.IsSYCL()) GTEST_SKIP() << "Not Implemented!"; + t::geometry::TriangleMesh mesh_empty(device); EXPECT_TRUE(mesh_empty.RemoveNonManifoldEdges().IsEmpty()); diff --git a/cpp/tests/t/io/ImageIO.cpp b/cpp/tests/t/io/ImageIO.cpp index db6be27041c..2c614481bd8 100644 --- a/cpp/tests/t/io/ImageIO.cpp +++ b/cpp/tests/t/io/ImageIO.cpp @@ -166,7 +166,7 @@ TEST(ImageIO, WriteImageToJPG) { EXPECT_TRUE(img.AsTensor().AllClose(read_img.AsTensor())); } -// JPG supports only UInt8, and PNG supports both UInt8 and UInt16. +// JPG supports only UInt8, and PNG supports Bool, UInt8 and UInt16. // All other data types are expected to fail. 
TEST(ImageIO, DifferentDtype) { const std::string tmp_path = utility::filesystem::GetTempDirectoryPath(); @@ -210,9 +210,8 @@ TEST(ImageIO, DifferentDtype) { EXPECT_FALSE( t::io::WriteImage(tmp_path + "/test_imageio_dtype.png", t::geometry::Image(100, 200, 3, core::Int64))); - EXPECT_FALSE( - t::io::WriteImage(tmp_path + "/test_imageio_dtype.png", - t::geometry::Image(100, 200, 3, core::Bool))); + EXPECT_TRUE(t::io::WriteImage(tmp_path + "/test_imageio_dtype.png", + t::geometry::Image(100, 200, 3, core::Bool))); } TEST(ImageIO, CornerCases) { diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 189fa6c50b0..c15dabae54a 100755 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -19,17 +19,17 @@ ARG PACKAGE ARG BUILD_SYCL_MODULE ARG CI -RUN if [ -z "${DEVELOPER_BUILD}" ]; then echo "Error: ARG DEVELOPER_BUILD not specified."; exit 1; fi \ - && if [ -z "${CCACHE_TAR_NAME}" ]; then echo "Error: ARG CCACHE_TAR_NAME not specified."; exit 1; fi \ - && if [ -z "${CMAKE_VERSION}" ]; then echo "Error: ARG CMAKE_VERSION not specified."; exit 1; fi \ - && if [ -z "${CCACHE_VERSION}" ]; then echo "Error: ARG CCACHE_VERSION not specified."; exit 1; fi \ - && if [ -z "${PYTHON_VERSION}" ]; then echo "Error: ARG PYTHON_VERSION not specified."; exit 1; fi \ - && if [ -z "${BUILD_SHARED_LIBS}" ]; then echo "Error: ARG BUILD_SHARED_LIBS not specified."; exit 1; fi \ - && if [ -z "${BUILD_CUDA_MODULE}" ]; then echo "Error: ARG BUILD_CUDA_MODULE not specified."; exit 1; fi \ - && if [ -z "${BUILD_TENSORFLOW_OPS}" ]; then echo "Error: ARG BUILD_TENSORFLOW_OPS not specified."; exit 1; fi \ - && if [ -z "${BUILD_PYTORCH_OPS}" ]; then echo "Error: ARG BUILD_PYTORCH_OPS not specified."; exit 1; fi \ - && if [ -z "${PACKAGE}" ]; then echo "Error: ARG PACKAGE not specified."; exit 1; fi \ - && if [ -z "${BUILD_SYCL_MODULE}" ]; then echo "Error: ARG BUILD_SYCL_MODULE not specified."; exit 1; fi +RUN if [[ -z "${DEVELOPER_BUILD}" ]]; then echo "Error: ARG DEVELOPER_BUILD not 
specified."; exit 1; fi \ + && if [[ -z "${CCACHE_TAR_NAME}" ]]; then echo "Error: ARG CCACHE_TAR_NAME not specified."; exit 1; fi \ + && if [[ -z "${CMAKE_VERSION}" ]]; then echo "Error: ARG CMAKE_VERSION not specified."; exit 1; fi \ + && if [[ -z "${CCACHE_VERSION}" ]]; then echo "Error: ARG CCACHE_VERSION not specified."; exit 1; fi \ + && if [[ -z "${PYTHON_VERSION}" ]]; then echo "Error: ARG PYTHON_VERSION not specified."; exit 1; fi \ + && if [[ -z "${BUILD_SHARED_LIBS}" ]]; then echo "Error: ARG BUILD_SHARED_LIBS not specified."; exit 1; fi \ + && if [[ -z "${BUILD_CUDA_MODULE}" ]]; then echo "Error: ARG BUILD_CUDA_MODULE not specified."; exit 1; fi \ + && if [[ -z "${BUILD_TENSORFLOW_OPS}" ]]; then echo "Error: ARG BUILD_TENSORFLOW_OPS not specified."; exit 1; fi \ + && if [[ -z "${BUILD_PYTORCH_OPS}" ]]; then echo "Error: ARG BUILD_PYTORCH_OPS not specified."; exit 1; fi \ + && if [[ -z "${PACKAGE}" ]]; then echo "Error: ARG PACKAGE not specified."; exit 1; fi \ + && if [[ -z "${BUILD_SYCL_MODULE}" ]]; then echo "Error: ARG BUILD_SYCL_MODULE not specified."; exit 1; fi # Fix Nvidia repo key rotation issue # https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771 @@ -87,7 +87,15 @@ RUN apt-get update && apt-get install -y \ libxmlsec1-dev \ libffi-dev \ liblzma-dev \ + && if [ "${BUILD_SYCL_MODULE}" = "ON" ]; then \ + add-apt-repository -y ppa:ubuntu-toolchain-r/test \ + && apt-get install -y g++-11 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 \ + && update-alternatives --set g++ /usr/bin/g++-11 \ + && c++ -v; \ + fi \ && rm -rf /var/lib/apt/lists/* +# OneDPL TBB backend requires libstdc++ >= v11. This makes the created wheel Ubuntu 22.04+ only.
# pyenv # The pyenv python paths are used during docker run, in this way docker run @@ -224,9 +232,9 @@ RUN \ && make VERBOSE=1 -j$(nproc) \ && make install-pip-package -j$(nproc) \ && make install -j$(nproc) \ - && if [ "${PACKAGE}" = "ON" ]; then make package; fi \ - && if [ "${PACKAGE}" = "VIEWER" ]; then make package-Open3DViewer-deb; fi \ - && if [ "${CI:-}a" != "a" ]; then rm -rf _deps assimp embree ipp mkl mkl_install webrtc; fi + && if [[ "${PACKAGE}" = "ON" ]]; then make package; fi \ + && if [[ "${PACKAGE}" = "VIEWER" ]]; then make package-Open3DViewer-deb; fi \ + && if [[ "${CI:-}a" != "a" ]]; then rm -rf _deps assimp embree ipp mkl mkl_install webrtc librealsense; fi # If CI is not null or unset, remove all large build folders to save disk space # Compress ccache folder, move to / directory @@ -236,8 +244,9 @@ RUN ccache -s \ && CCACHE_DIR_PARENT=$(dirname ${CCACHE_DIR}) \ && cd ${CCACHE_DIR_PARENT} \ && tar -caf /${CCACHE_TAR_NAME}.tar.xz ${CCACHE_DIR_NAME} \ - && if [ "${PACKAGE}" = "ON" ]; then mv /root/Open3D/build/package/open3d-devel*.tar.xz /; fi \ - && if [ "${PACKAGE}" = "VIEWER" ]; then mv /root/Open3D/build/package-Open3DViewer-deb/open3d-viewer-*-Linux.deb /; fi \ + && if [[ "${PACKAGE}" = "ON" ]]; then mv /root/Open3D/build/package/open3d-devel*.tar.xz /; fi \ + && if [[ "${PACKAGE}" = "VIEWER" ]]; then mv /root/Open3D/build/package-Open3DViewer-deb/open3d-viewer-*-Linux.deb /; fi \ + && if [[ "${BUILD_SYCL_MODULE}" = "ON" && "${BUILD_SHARED_LIBS}" = "ON" ]]; then mv /root/Open3D/build/lib/python_package/pip_package/open3d-*.whl /; fi \ && ls -alh / RUN echo "Docker build done." 
diff --git a/docker/docker_build.sh b/docker/docker_build.sh index e6c22bb10df..176657c795b 100755 --- a/docker/docker_build.sh +++ b/docker/docker_build.sh @@ -451,13 +451,17 @@ sycl-shared_export_env() { export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 export DEVELOPER_BUILD=ON export CCACHE_TAR_NAME=open3d-ci-sycl - export PYTHON_VERSION=3.8 + export PYTHON_VERSION=3.10 export BUILD_SHARED_LIBS=ON export BUILD_CUDA_MODULE=OFF - export BUILD_TENSORFLOW_OPS=OFF - export BUILD_PYTORCH_OPS=OFF - export PACKAGE=OFF + export BUILD_TENSORFLOW_OPS=ON + export BUILD_PYTORCH_OPS=ON + export PACKAGE=ON export BUILD_SYCL_MODULE=ON + + export IGC_EnableDPEmulation=1 # Enable float64 emulation during compilation + export SYCL_CACHE_PERSISTENT=1 # Cache SYCL kernel binaries. + export OverrideDefaultFP64Settings=1 # Enable double precision emulation at runtime. } sycl-static_export_env() { @@ -468,13 +472,17 @@ sycl-static_export_env() { export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 export DEVELOPER_BUILD=ON export CCACHE_TAR_NAME=open3d-ci-sycl - export PYTHON_VERSION=3.8 + export PYTHON_VERSION=3.10 export BUILD_SHARED_LIBS=OFF export BUILD_CUDA_MODULE=OFF export BUILD_TENSORFLOW_OPS=OFF export BUILD_PYTORCH_OPS=OFF export PACKAGE=OFF export BUILD_SYCL_MODULE=ON + + export IGC_EnableDPEmulation=1 # Enable float64 emulation during compilation + export SYCL_CACHE_PERSISTENT=1 # Cache SYCL kernel binaries. + export OverrideDefaultFP64Settings=1 # Enable double precision emulation at runtime. } function main() { diff --git a/docs/index.rst b/docs/index.rst index 09dda4f1dee..daa572d280d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ Open3D: A Modern Library for 3D Data Processing builddocs docker arm + sycl open3d_ml .. toctree:: diff --git a/docs/sycl.rst b/docs/sycl.rst new file mode 100644 index 00000000000..e17d085f79f --- /dev/null +++ b/docs/sycl.rst @@ -0,0 +1,126 @@ +.. 
_sycl: + +Cross-platform GPU support (SYCL) +================================= + +From v0.19, Open3D provides an experimental SYCL backend for cross-platform GPU +support. This backend allows Open3D operations to run on many different GPUs, +including integrated GPUs and discrete GPUs from Intel, Nvidia and AMD. We +provide pre-built C++ binaries and Python 3.10 wheels for Linux (Ubuntu 22.04+). + +Enabled features +----------------- + +Many Tensor API operations and Tensor Geometry operations without custom kernels +can now be offloaded to SYCL devices. In addition, HW accelerated raycasting +queries in :py:class:`open3d.t.geometry.RayCastingScene` are also supported. You +will get an error if an operation is not supported. The implementation is tested +on Linux on Intel integrated and discrete GPUs. Currently, a single GPU +(`SYCL:0`, if available) and the CPU (`SYCL:1` if a GPU is available, else +`SYCL:0`) are supported. + +Installation +------------- + +Both C++ binaries and Python wheels (Python 3.10 only for now) can be downloaded +from the Open3D GitHub releases page. For C++, install the `OneAPI runtime +`_ +and (optionally) SYCL runtime for your `Nvidia +`_ or `AMD +`_ GPU. + +For Python, the wheels will automatically install the DPC++ runtime package +(`dpcpp-cpp-rt`). You will also need `libomp5` installed: `apt-get install +libomp5-11`. Make sure to have the `correct drivers installed +`_ for your GPU. For +raycasting on Intel GPUs, you will also need the +`intel-level-zero-gpu-raytracing` package. + +Usage +------ + +The SYCL backend requires the new CXX11 ABI (Linux, gcc, libstdc++ only). If you +need to use the Open3D PyTorch extension, you should use cxx11_abi wheels for +PyTorch: + +.. code-block:: shell + + pip install torch==2.2.2+cpu.cxx11.abi -i https://download.pytorch.org/whl/cpu/ + +Some GPUs do not have native double precision support. For Intel GPUs, you can +emulate support with these environment variables: + +.. 
code-block:: shell + + export IGC_EnableDPEmulation=1 # Enable float64 emulation during compilation + export OverrideDefaultFP64Settings=1 # Enable double precision emulation at runtime. + +The binaries only contain kernels compiled to SPIR-V IR. At runtime, they will +be JIT compiled to your target GPU's native ISA. This means that the first run +of a kernel on a new device will be slower than subsequent runs. Use this +environment variable to cache the JIT compiled kernels to your home directory: + +.. code-block:: shell + + export SYCL_CACHE_PERSISTENT=1 # Cache SYCL kernel binaries. + +.. code-block:: python + + import open3d as o3d + o3d.core.sycl.enable_persistent_jit_cache() # Cache SYCL kernel binaries. + +For multi-GPU systems (e.g. with both integrated and discrete GPUs), the more +powerful GPU is automatically selected, as long as the correct GPU drivers and +SYCL runtime are installed. You can select a specific device with the +`ONEAPI_DEVICE_FILTER` or `SYCL_DEVICE_ALLOWLIST` `environment variables +`_. + + +.. code-block:: shell + + # Print all available devices (command line): + sycl-ls + # Examples: + export ONEAPI_DEVICE_SELECTOR="opencl:1" # Select the 2nd OpenCL device + + +.. code-block:: python + + # Print all available devices (Python): + import os + os.environ["SYCL_DEVICE_ALLOWLIST"] = "BackendName:cuda" # Select CUDA GPU + import open3d as o3d + o3d.core.sycl.print_sycl_devices(print_all=True) + + # Return a list of available devices. + o3d.core.sycl.get_available_device() + + # Check if a device is available + o3d.core.sycl.is_available(o3d.core.Device("SYCL:0")) + + +Building from source +--------------------- + +You can build the binaries from source as shown below. To build for a different +Python version, set the `PYTHON_VERSION` variable in `docker/docker_build.sh`. + +.. code-block:: shell + + cd docker + ./docker_build.sh sycl-shared + +This will create the Python wheel and C++ binary archive in the current +directory.
+ +You can directly compile for a specific target device (i.e. ahead of time or AOT +compilation) using the OPEN3D_SYCL_TARGETS (`-fsycl-targets` compiler option) and +OPEN3D_SYCL_TARGET_BACKEND_OPTIONS (`-Xs` compiler option) CMake variables in +Open3D. See the `compiler documentation +`_ for +information about building for specific hardware. + +If you want to use different settings (e.g. AOT compilation for a specific +device, or build a wheel for a different Python version), you can update the +``docker_build.sh`` script, or build directly on host after installing the +``intel-basekit`` or ``intel-cpp-essentials`` Debian packages from the Intel +OneAPI repository. \ No newline at end of file diff --git a/python/requirements.txt b/python/requirements.txt index a7da4963446..66db20516fb 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,4 +1,4 @@ -numpy>=1.18.0,<2.0.0 +numpy>=1.18.0 dash>=2.6.0 werkzeug>=3.0.0 flask>=3.0.0 diff --git a/python/requirements_sycl.txt b/python/requirements_sycl.txt new file mode 100644 index 00000000000..31f4c8a05cb --- /dev/null +++ b/python/requirements_sycl.txt @@ -0,0 +1 @@ +dpcpp-cpp-rt==2024.1.0 \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 341d91f361c..b2be5b93af7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -154,7 +154,8 @@ def finalize_options(self): long_description = readme.read() # open3d-cpu wheel for Linux x86_64 if sys.platform.startswith("linux") and platform.machine() in ( - 'i386', 'x86_64', 'AMD64') and "@BUILD_CUDA_MODULE@" == "OFF": + 'i386', 'x86_64', 'AMD64' +) and "@BUILD_CUDA_MODULE@" == "OFF" and "@BUILD_SYCL_MODULE@" == "OFF": name += "-cpu" long_description += ("\n\nThis wheel only contains CPU functionality. 
" "Use the open3d wheel for full functionality.") diff --git a/python/test/core/test_core.py b/python/test/core/test_core.py index c94a961e1fe..2f344381a57 100644 --- a/python/test/core/test_core.py +++ b/python/test/core/test_core.py @@ -68,7 +68,7 @@ def to_numpy_dtype(dtype: o3c.Dtype): @pytest.mark.parametrize("dtype", list_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_creation(dtype, device): # Shape takes tuple, list or o3c.SizeVector t = o3c.Tensor.empty((2, 3), dtype, device=device) @@ -95,7 +95,7 @@ def test_creation(dtype, device): @pytest.mark.parametrize("shape", [(), (0,), (1,), (0, 2), (0, 0, 2), (2, 0, 3)]) @pytest.mark.parametrize("dtype", list_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_creation_special_shapes(shape, dtype, device): o3_t = o3c.Tensor.full(shape, 3.14, dtype, device=device) np_t = np.full(shape, 3.14, dtype=to_numpy_dtype(dtype)) @@ -128,7 +128,7 @@ def test_device(): @pytest.mark.parametrize("dtype", list_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensor_constructor(dtype, device): # Numpy array np_t = np.array([[0, 1, 2], [3, 4, 5]], dtype=to_numpy_dtype(dtype)) @@ -177,7 +177,7 @@ def test_tensor_constructor(dtype, device): np.testing.assert_equal(np_t, o3_t.cpu().numpy()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_arange(device): # Full parameters. 
setups = [(0, 10, 1), (0, 10, 1), (0.0, 10.0, 2.0), (0.0, -10.0, -2.0)] @@ -225,7 +225,7 @@ def test_arange(device): np.testing.assert_equal(np_t, o3_t.cpu().numpy()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_flatten(device): # Flatten 0-D tensor @@ -370,7 +370,7 @@ def test_flatten(device): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_append(dtype, device): # Appending 0-D. # 0-D can only be appended along axis = null. @@ -585,7 +585,7 @@ def get_dst_t(): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_binary_ew_ops(dtype, device): a = o3c.Tensor(np.array([4, 6, 8, 10, 12, 14]), dtype=dtype, device=device) b = o3c.Tensor(np.array([2, 3, 4, 5, 6, 7]), dtype=dtype, device=device) @@ -613,7 +613,7 @@ def test_binary_ew_ops(dtype, device): np.testing.assert_equal(a.cpu().numpy(), np.array([2, 2, 2, 2, 2, 2])) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_to(device): a = o3c.Tensor(np.array([0.1, 1.2, 2.3, 3.4, 4.5, 5.6]).astype(np.float32), device=device) @@ -625,7 +625,7 @@ def test_to(device): assert b.device == a.device -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_unary_ew_ops(device): src_vals = np.array([0, 1, 2, 3, 4, 5]).astype(np.float32) src = o3c.Tensor(src_vals, device=device) @@ -658,7 +658,7 @@ def test_unary_ew_ops(device): atol=atol) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_getitem(device): np_t = np.array(range(24)).reshape((2, 3, 4)) o3_t = 
o3c.Tensor(np_t, device=device) @@ -700,7 +700,7 @@ def test_getitem(device): o3c.Tensor.ones((), device=device)[0:1] -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_setitem(device): np_ref = np.array(range(24)).reshape((2, 3, 4)) @@ -821,7 +821,7 @@ def test_setitem(device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_sum(dim, keepdim, device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = o3c.Tensor(np_src, device=device) @@ -840,7 +840,7 @@ def test_reduction_sum(dim, keepdim, device): ((0, 2), (1)), ]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_special_shapes(shape_and_axis, keepdim, device): shape, axis = shape_and_axis np_src = np.array(np.random.rand(*shape)) @@ -856,7 +856,7 @@ def test_reduction_special_shapes(shape_and_axis, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_mean(dim, keepdim, device): np_src = np.array(range(24)).reshape((2, 3, 4)).astype(np.float32) o3_src = o3c.Tensor(np_src, device=device) @@ -870,7 +870,7 @@ def test_reduction_mean(dim, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_prod(dim, keepdim, 
device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = o3c.Tensor(np_src, device=device) @@ -884,7 +884,7 @@ def test_reduction_prod(dim, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_min(dim, keepdim, device): np_src = np.array(range(24)) np.random.shuffle(np_src) @@ -900,7 +900,7 @@ def test_reduction_min(dim, keepdim, device): "dim", [0, 1, 2, (), (0,), (1,), (2,), (0, 1), (0, 2), (1, 2), (0, 1, 2), None]) @pytest.mark.parametrize("keepdim", [True, False]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_max(dim, keepdim, device): np_src = np.array(range(24)) np.random.shuffle(np_src) @@ -913,7 +913,7 @@ def test_reduction_max(dim, keepdim, device): @pytest.mark.parametrize("dim", [0, 1, 2, None]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_reduction_argmin_argmax(dim, device): np_src = np.array(range(24)) np.random.shuffle(np_src) @@ -929,7 +929,7 @@ def test_reduction_argmin_argmax(dim, device): np.testing.assert_allclose(o3_dst.cpu().numpy(), np_dst) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_advanced_index_get_mixed(device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = o3c.Tensor(np_src, device=device) @@ -955,7 +955,7 @@ def test_advanced_index_get_mixed(device): np.testing.assert_equal(o3_dst.cpu().numpy(), np_dst) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_advanced_index_set_mixed(device): np_src = np.array(range(24)).reshape((2, 3, 4)) o3_src = 
o3c.Tensor(np_src, device=device) @@ -988,7 +988,7 @@ def test_advanced_index_set_mixed(device): ("ceil", "ceil"), ("round", "round"), ("trunc", "trunc")]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_unary_elementwise(np_func_name, o3_func_name, device): np_t = np.array([-3.4, -2.6, -1.5, 0, 1.4, 2.6, 3.5]).astype(np.float32) o3_t = o3c.Tensor(np_t, device=device) @@ -1010,7 +1010,7 @@ def test_unary_elementwise(np_func_name, o3_func_name, device): atol=1e-7) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_logical_ops(device): np_a = np.array([True, False, True, False]) np_b = np.array([True, True, False, False]) @@ -1030,7 +1030,7 @@ def test_logical_ops(device): np.testing.assert_equal(o3_r.cpu().numpy(), np_r) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_comparision_ops(device): np_a = np.array([0, 1, -1]) np_b = np.array([0, 0, 0]) @@ -1045,7 +1045,7 @@ def test_comparision_ops(device): np.testing.assert_equal((o3_a != o3_b).cpu().numpy(), np_a != np_b) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_non_zero(device): np_x = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]]) np_nonzero_tuple = np.nonzero(np_x) @@ -1055,7 +1055,7 @@ def test_non_zero(device): np.testing.assert_equal(np_t, o3_t.cpu().numpy()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_boolean_advanced_indexing(device): np_a = np.array([1, -1, -2, 3]) o3_a = o3c.Tensor(np_a, device=device) @@ -1140,7 +1140,7 @@ def test_boolean_advanced_indexing(device): device=device) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def 
test_scalar_op(device): # + a = o3c.Tensor.ones((2, 3), o3c.float32, device=device) @@ -1390,7 +1390,7 @@ def test_scalar_op(device): np.testing.assert_equal(a.cpu().numpy(), np.array([1, 0, 1])) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_all_any(device): a = o3c.Tensor([False, True, True, True], dtype=o3c.bool, device=device) assert not a.all() @@ -1405,7 +1405,7 @@ def test_all_any(device): assert not a.any() -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_allclose_isclose(device): a = o3c.Tensor([1, 2], device=device) b = o3c.Tensor([1, 3], device=device) @@ -1430,7 +1430,7 @@ def test_allclose_isclose(device): assert not a.allclose(b) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_issame(device): dtype = o3c.float32 a = o3c.Tensor.ones((2, 3), dtype, device=device) @@ -1448,7 +1448,7 @@ def test_issame(device): assert d.issame(e) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_item(device): o3_t = o3c.Tensor.ones((2, 3), dtype=o3c.float32, device=device) * 1.5 assert o3_t[0, 0].item() == 1.5 @@ -1471,7 +1471,7 @@ def test_item(device): assert isinstance(o3_t[0, 0].item(), bool) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_save_load(device): with tempfile.TemporaryDirectory() as temp_dir: file_name = f"{temp_dir}/tensor.npy" @@ -1535,7 +1535,7 @@ def test_save_load(device): np.testing.assert_equal(o3_t_load.cpu().numpy(), np_t) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_iterator(device): # 0-d. 
o3_t = o3c.Tensor.ones((), dtype=o3c.float32, device=device) @@ -1572,7 +1572,7 @@ def test_iterator(device): np.array([[0, 10, 20], [30, 40, 50]])) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): o3_t = o3c.Tensor.ones((100), dtype=o3c.float32, device=device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/core/test_linalg.py b/python/test/core/test_linalg.py index 7d1580570f8..7ab8908394f 100644 --- a/python/test/core/test_linalg.py +++ b/python/test/core/test_linalg.py @@ -17,7 +17,7 @@ from open3d_test import list_devices -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_matmul(device, dtype): @@ -71,7 +71,7 @@ def test_matmul(device, dtype): assert 'dimensions with zero' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_addmm(device, dtype): # Shape takes tuple, list or o3c.SizeVector @@ -141,7 +141,7 @@ def test_addmm(device, dtype): assert 'dimensions with zero' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_det(device, dtype): @@ -172,7 +172,7 @@ def test_det(device, dtype): assert 'must be square' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_lu(device, dtype): @@ -203,7 +203,7 @@ def test_lu(device, dtype): assert 'must be 2D' in 
str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_lu_ipiv(device, dtype): @@ -237,7 +237,7 @@ def test_lu_ipiv(device, dtype): assert 'must be 2D' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_inverse(device, dtype): @@ -289,7 +289,7 @@ def test_inverse(device, dtype): assert 'Singular matrix' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_svd(device, dtype): @@ -351,7 +351,7 @@ def test_svd(device, dtype): assert 'must be 2D' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_solve(device, dtype): # Test square @@ -369,7 +369,7 @@ def test_solve(device, dtype): assert 'singular' in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.float32, o3c.float64]) def test_lstsq(device, dtype): # Test square @@ -416,7 +416,7 @@ def test_lstsq(device, dtype): a_shape[0], a_shape[1]) in str(excinfo.value) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_thiu(device, dtype): @@ -440,7 +440,7 @@ def test_thiu(device, dtype): atol=1e-5) -@pytest.mark.parametrize("device", 
list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_thil(device, dtype): @@ -464,7 +464,7 @@ def test_thil(device, dtype): atol=1e-5) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) @pytest.mark.parametrize("dtype", [o3c.int32, o3c.int64, o3c.float32, o3c.float64]) def test_thiul(device, dtype): diff --git a/python/test/core/test_size_vector.py b/python/test/core/test_size_vector.py index 57ab9f8ab88..b4e80a7a1b5 100644 --- a/python/test/core/test_size_vector.py +++ b/python/test/core/test_size_vector.py @@ -67,7 +67,7 @@ def test_size_vector(): sv = o3d.core.SizeVector(["foo", "bar"]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_implicit_conversion(device): # Reshape t = o3d.core.Tensor.ones((3, 4), device=device) diff --git a/python/test/core/test_tensor_function.py b/python/test/core/test_tensor_function.py index 4a45b06f599..7f4390e8706 100644 --- a/python/test/core/test_tensor_function.py +++ b/python/test/core/test_tensor_function.py @@ -49,7 +49,7 @@ def list_non_bool_dtypes(): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_concatenate(dtype, device): # 0-D cannot be concatenated. @@ -200,7 +200,7 @@ def test_concatenate(dtype, device): @pytest.mark.parametrize("dtype", list_non_bool_dtypes()) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_append(dtype, device): # Appending 0-D. # 0-D can only be appended along axis = null. 
diff --git a/python/test/ml_ops/test_voxel_pooling.py b/python/test/ml_ops/test_voxel_pooling.py index a5198cdfc42..7fb85366571 100644 --- a/python/test/ml_ops/test_voxel_pooling.py +++ b/python/test/ml_ops/test_voxel_pooling.py @@ -80,7 +80,9 @@ def test_voxel_pooling(ml, pos_dtype, feat_dtype, position_fn, feature_fn): else: index = [1, 0] - np.testing.assert_allclose(ans.pooled_positions, expected_positions[index]) + np.testing.assert_allclose(ans.pooled_positions, + expected_positions[index], + rtol=1.1e-7) if feature_fn == 'average': if np.issubdtype(feat_dtype, np.integer): @@ -99,7 +101,9 @@ def test_voxel_pooling(ml, pos_dtype, feat_dtype, position_fn, feature_fn): elif feature_fn == 'nearest_neighbor': expected_features = np.array([features[0], features[3]]) - np.testing.assert_allclose(ans.pooled_features, expected_features[index]) + np.testing.assert_allclose(ans.pooled_features, + expected_features[index], + rtol=1.1e-7) @mltest.parametrize.ml_cpu_only diff --git a/python/test/t/geometry/test_lineset.py b/python/test/t/geometry/test_lineset.py index 2d3daceccaa..143c1a02159 100644 --- a/python/test/t/geometry/test_lineset.py +++ b/python/test/t/geometry/test_lineset.py @@ -35,7 +35,7 @@ def test_extrude_linear(): assert ans.triangle.indices.shape == (4, 3) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): line = o3d.t.geometry.LineSet([[0.7, 0, 0], [1, 0, 0]], [[0, 1]]).to(device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/t/geometry/test_pointcloud.py b/python/test/t/geometry/test_pointcloud.py index dc98b1db951..74f5d5f27c0 100644 --- a/python/test/t/geometry/test_pointcloud.py +++ b/python/test/t/geometry/test_pointcloud.py @@ -19,7 +19,7 @@ from open3d_test import list_devices -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def 
test_constructor_and_accessors(device): dtype = o3c.float32 @@ -46,7 +46,7 @@ def test_constructor_and_accessors(device): assert pcd.point.positions.allclose(o3c.Tensor([[1, 2, 3]], dtype, device)) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_from_legacy(device): dtype = o3c.float32 @@ -63,7 +63,7 @@ def test_from_legacy(device): o3c.Tensor([[6, 7, 8], [9, 10, 11]], dtype, device)) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_to_legacy(device): dtype = o3c.float32 @@ -180,7 +180,7 @@ def test_extrude_linear(): assert ans.line.indices.shape == (1, 2) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): pcd = o3d.t.geometry.PointCloud(device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/t/geometry/test_tensormap.py b/python/test/t/geometry/test_tensormap.py index 255c1ef55b2..18a9b70aa07 100644 --- a/python/test/t/geometry/test_tensormap.py +++ b/python/test/t/geometry/test_tensormap.py @@ -22,7 +22,7 @@ class WrongType(): pass -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensormap(device): dtype = o3c.float32 @@ -103,7 +103,7 @@ def test_tensormap(device): assert primary_key == "positions" -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensormap_modify(device): # Assigning to the *elements* of an alias will change the value in the map. # This tests that the alias shares the same memory as the tensor in the map. 
@@ -170,7 +170,7 @@ def test_tensormap_modify(device): np.testing.assert_equal(tm.b.cpu().numpy(), [100]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_tensor_dict_modify(device): """ Same as test_tensormap_modify(), but we put Tensors in a python dict. @@ -301,7 +301,7 @@ def test_numpy_dict_modify(): np.testing.assert_equal(tm["b"], [100]) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): tm = o3d.t.geometry.TensorMap("positions") with tempfile.TemporaryDirectory() as temp_dir: diff --git a/python/test/t/geometry/test_trianglemesh.py b/python/test/t/geometry/test_trianglemesh.py index 88678e3f877..c74aeb67d26 100644 --- a/python/test/t/geometry/test_trianglemesh.py +++ b/python/test/t/geometry/test_trianglemesh.py @@ -34,7 +34,7 @@ def test_slice_plane(): assert slices.line.indices.shape == (9, 2) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_box(device): # Test with default parameters. box_default = o3d.t.geometry.TriangleMesh.create_box(device=device) @@ -118,7 +118,7 @@ def test_create_box(device): assert box_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_sphere(device): # Test with custom parameters. sphere_custom = o3d.t.geometry.TriangleMesh.create_sphere( @@ -180,7 +180,7 @@ def test_create_sphere(device): assert sphere_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_tetrahedron(device): # Test with custom parameters. 
tetrahedron_custom = o3d.t.geometry.TriangleMesh.create_tetrahedron( @@ -204,7 +204,7 @@ def test_create_tetrahedron(device): assert tetrahedron_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_octahedron(device): # Test with custom parameters. octahedron_custom = o3d.t.geometry.TriangleMesh.create_octahedron( @@ -242,7 +242,7 @@ def test_create_octahedron(device): assert octahedron_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_icosahedron(device): # Test with custom parameters. icosahedron_custom = o3d.t.geometry.TriangleMesh.create_icosahedron( @@ -298,7 +298,7 @@ def test_create_icosahedron(device): assert icosahedron_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_cylinder(device): # Test with custom parameters. cylinder_custom = o3d.t.geometry.TriangleMesh.create_cylinder( @@ -360,7 +360,7 @@ def test_create_cylinder(device): assert cylinder_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_cone(device): # Test with custom parameters. cone_custom = o3d.t.geometry.TriangleMesh.create_cone( @@ -404,7 +404,7 @@ def test_create_cone(device): assert cone_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_torus(device): # Test with custom parameters. 
torus_custom = o3d.t.geometry.TriangleMesh.create_torus( @@ -482,7 +482,7 @@ def test_create_torus(device): assert torus_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_arrow(device): # Test with custom parameters. arrow_custom = o3d.t.geometry.TriangleMesh.create_arrow( @@ -546,7 +546,7 @@ def test_create_arrow(device): assert arrow_custom.triangle.indices.allclose(triangle_indices_custom) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_create_mobius(device): # Test with custom parameters. mobius_custom = o3d.t.geometry.TriangleMesh.create_mobius( @@ -818,7 +818,7 @@ def test_extrude_linear(): assert ans.triangle.indices.shape == (8, 3) -@pytest.mark.parametrize("device", list_devices()) +@pytest.mark.parametrize("device", list_devices(enable_sycl=True)) def test_pickle(device): mesh = o3d.t.geometry.TriangleMesh.create_box().to(device) with tempfile.TemporaryDirectory() as temp_dir: diff --git a/util/ci_utils.sh b/util/ci_utils.sh index ce921111092..b6110110d7b 100644 --- a/util/ci_utils.sh +++ b/util/ci_utils.sh @@ -21,6 +21,7 @@ fi BUILD_TENSORFLOW_OPS=${BUILD_TENSORFLOW_OPS:-ON} BUILD_PYTORCH_OPS=${BUILD_PYTORCH_OPS:-ON} LOW_MEM_USAGE=${LOW_MEM_USAGE:-OFF} +BUILD_SYCL_MODULE=${BUILD_SYCL_MODULE:-OFF} # Dependency versions: # CUDA: see docker/docker_build.sh @@ -28,6 +29,7 @@ LOW_MEM_USAGE=${LOW_MEM_USAGE:-OFF} TENSORFLOW_VER="2.16.2" TORCH_VER="2.2.2" TORCH_REPO_URL="https://download.pytorch.org/whl/torch/" +TORCH_CXX11_URL="https://download.pytorch.org/whl/" # Python PIP_VER="24.3.1" PROTOBUF_VER="4.24.0" @@ -73,9 +75,12 @@ install_python_dependencies() { python -m pip install -U "$TF_ARCH_NAME"=="$TENSORFLOW_VER" # ML/requirements-tensorflow.txt fi if [ "$BUILD_PYTORCH_OPS" == "ON" ]; then # ML/requirements-torch.txt - if [[ "$OSTYPE" 
== "linux-gnu"* ]]; then - python -m pip install -U "${TORCH_GLNX}" -f "$TORCH_REPO_URL" tensorboard - + if [[ "$OSTYPE" == "linux-gnu"* && "$BUILD_SYCL_MODULE" == "OFF" ]]; then + python -m pip install -U "${TORCH_GLNX}" -f "$TORCH_REPO_URL" + python -m pip install tensorboard + elif [[ "$OSTYPE" == "linux-gnu"* && "$BUILD_SYCL_MODULE" == "ON" ]]; then + python -m pip install -U "${TORCH_GLNX}.cxx11.abi" -i "$TORCH_CXX11_URL" + python -m pip install tensorboard elif [[ "$OSTYPE" == "darwin"* ]]; then python -m pip install -U torch=="$TORCH_VER" -f "$TORCH_REPO_URL" tensorboard else