[2/4] Intel GPU Runtime Upstreaming for Device (pytorch#116833)

# Motivation According to [[1/4] Intel GPU Runtime Upstreaming for Device](pytorch#116019), as mentioned in [[RFC] Intel GPU Runtime Upstreaming](pytorch#114842), the second PR covers the changes under `aten`. # Design We will compile the code for XPU separately into a library named `libtorch_xpu.so`. Currently, it primarily offers device-related APIs, including - `getCurrentDeviceProperties` - `getDeviceProperties` - `getGlobalIdxFromDevice` - `getDeviceFromPtr` # Additional Context `XPUHooks` is an indispensable part of the runtime. We upstream `XPUHooks` in this PR since there is some code related to `Device` in it and we also refine some logic and code to avoid forward declaration in `DLPack`. Pull Request resolved: pytorch#116833 Approved by: https://github.com/EikanWang, https://github.com/jgong5, https://github.com/gujinghui, https://github.com/malfet
Bowen12992 · Jan 18, 2024 · 79811e7 · 79811e7
1 parent 61ea303
commit 79811e7
Show file tree

Hide file tree

Showing 19 changed files with 369 additions and 43 deletions.
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -57,6 +57,8 @@ code = 'CLANGFORMAT'
 include_patterns = [
     'aten/src/ATen/*.h',
     'aten/src/ATen/mps/**/*.mm',
+    'aten/src/ATen/xpu/**/*.h',
+    'aten/src/ATen/xpu/**/*.cpp',
     'aten/src/ATen/native/mps/**/*.mm',
     'aten/src/ATen/native/vulkan/**/*.h',
     'aten/src/ATen/native/vulkan/**/*.cpp',

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1171,6 +1171,10 @@ if(BUILD_SHARED_LIBS)
       ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake
       DESTINATION share/cmake/Caffe2/
       COMPONENT dev)
+  install(FILES
+      ${PROJECT_SOURCE_DIR}/cmake/Modules/FindSYCLToolkit.cmake
+      DESTINATION share/cmake/Caffe2/
+      COMPONENT dev)
 
   install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
       FILE Caffe2Targets.cmake

diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
@@ -34,6 +34,9 @@ set(ATen_HIP_TEST_SRCS)
 set(ATen_HIP_INCLUDE)
 set(ATen_MPS_SRCS)
 set(ATen_MPS_TEST_SRCS)
+set(ATen_XPU_SRCS)
+set(ATen_XPU_INCLUDE)
+set(ATen_XPU_TEST_SRCS)
 set(ATen_VULKAN_TEST_SRCS)
 set(ATen_CPU_DEPENDENCY_LIBS)
 set(ATen_CUDA_DEPENDENCY_LIBS)
@@ -111,6 +114,8 @@ set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
 set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE)
 set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
+set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE)
+set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
 set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
@@ -122,6 +127,7 @@ set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
 set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE)
 set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
+set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE)
 set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
 set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
@@ -69,6 +69,9 @@ file(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh")
 file(GLOB cudnn_cpp "cudnn/*.cpp")
 file(GLOB ops_h "ops/*.h")
 
+file(GLOB xpu_h "xpu/*.h" "xpu/detail/*.h")
+file(GLOB xpu_cpp "xpu/*.cpp" "xpu/detail/*.cpp")
+
 file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h")
 file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp")
 list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp")
@@ -321,6 +324,11 @@ if(USE_ROCM)
   )
 endif()
 
+if(USE_XPU)
+  list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/xpu)
+  list(APPEND ATen_XPU_SRCS ${xpu_cpp})
+endif()
+
 list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..)
 
 if(USE_TBB)
@@ -579,7 +587,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
 
 set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
 if(NOT INTERN_BUILD_MOBILE)
-  list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
+  list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
   # Metal
   if(USE_PYTORCH_METAL_EXPORT)
     # Add files needed from exporting metal models(optimized_for_mobile)
@@ -649,9 +657,11 @@ set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SC
 set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
 set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
 set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE)
+set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE)
 set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE)
 set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
+set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
@@ -664,6 +674,7 @@ set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
 set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
 set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE)
 set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
+set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE)
 set(ATen_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE} PARENT_SCOPE)
 set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)

diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
@@ -63,6 +63,8 @@ class TORCH_API Context {
       return c10::DeviceType::CPU;
     } else if (device_type == at::kCUDA) {
       return at::detail::getCUDAHooks().getDeviceFromPtr(data);
+    } else if (device_type == at::kXPU) {
+      return at::detail::getXPUHooks().getDeviceFromPtr(data);
     } else if (device_type == at::kPrivateUse1) {
       return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data);
     } else {
@@ -481,19 +483,6 @@ static inline void manual_seed(uint64_t seed) {
     }
   }
 
-  const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs();
-  if (hasXPU() && xpu_num_gpus > 0) {
-    for (const auto i : c10::irange(xpu_num_gpus)) {
-      auto xpu_gen = globalContext().defaultGenerator(
-          Device(at::kXPU, static_cast<c10::DeviceIndex>(i)));
-      {
-        // See Note [Acquire lock when using random generators]
-        std::lock_guard<std::mutex> lock(xpu_gen.mutex());
-        xpu_gen.set_current_seed(seed);
-      }
-    }
-  }
-
   if (hasMPS()) {
     auto mps_gen = globalContext().defaultGenerator(c10::DeviceType::MPS);
     // See Note [Acquire lock when using random generators]

diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp
@@ -111,8 +111,9 @@ static DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) {
       ctx.device_type = DLDeviceType::kDLROCM;
       break;
     case DeviceType::XPU:
-      ctx = at::detail::getXPUHooks().getDLPackDeviceFromATenDevice(
-          ctx, tensor.device(), tensor.data_ptr());
+      ctx.device_type = DLDeviceType::kDLOneAPI;
+      ctx.device_id =
+          at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device());
       break;
     default:
       TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str());
@@ -139,7 +140,7 @@ static Device getATenDevice(const DLDevice& ctx, void* data) {
       return at::Device(DeviceType::HIP, ctx.device_id);
 #endif
     case DLDeviceType::kDLOneAPI:
-      return at::detail::getXPUHooks().getATenDeviceFromDLPackDevice(ctx, data);
+      return at::detail::getXPUHooks().getDeviceFromPtr(data);
     default:
       TORCH_CHECK(
           false, "Unsupported device_type: " + c10::to_string(ctx.device_type));

diff --git a/aten/src/ATen/detail/XPUHooksInterface.h b/aten/src/ATen/detail/XPUHooksInterface.h
@@ -9,11 +9,6 @@
 #include <functional>
 #include <memory>
 
-// We use forward declaration here instead of #include <ATen/dlpack.h> to avoid
-// leaking DLPack implementation detail to every project that includes `ATen/Context.h`, which in turn
-// would lead to a conflict when linked with another project using DLPack (for example TVM)
-struct DLDevice_;
-
 namespace at {
 
 constexpr const char* XPU_HELP =
@@ -44,23 +39,8 @@ struct TORCH_API XPUHooksInterface {
         XPU_HELP);
   }
 
-  virtual Device getATenDeviceFromDLPackDevice(
-      const DLDevice_& dl_device,
-      void* data) const {
-    TORCH_CHECK(
-        false,
-        "Cannot get XPU device without Intel Extension for Pytorch. ",
-        XPU_HELP);
-  }
-
-  virtual DLDevice_& getDLPackDeviceFromATenDevice(
-      DLDevice_& dl_device,
-      const Device& aten_device,
-      void* data) const {
-    TORCH_CHECK(
-        false,
-        "Cannot get XPU DL device without Intel Extension for Pytorch. ",
-        XPU_HELP);
+  virtual int getGlobalIdxFromDevice(const Device& device) const {
+    TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library.");
   }
 
   virtual Generator getXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
@@ -74,6 +54,10 @@ struct TORCH_API XPUHooksInterface {
   virtual int getNumGPUs() const {
     return 0;
   }
+
+  virtual Device getDeviceFromPtr(void* /*data*/) const {
+    TORCH_CHECK(false, "Cannot get device of pointer on XPU without ATen_xpu library.");
+  }
 };
 
 struct TORCH_API XPUHooksArgs {};

diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h
@@ -94,10 +94,7 @@ typedef enum {
 /*!
  * \brief A Device for Tensor and operator.
  */
-// NB: This is the only difference from
-// https://github.com/dmlc/dlpack/blob/v0.7/include/dlpack/dlpack.h Required to
-// allow forward declaration of DLDevice.
-typedef struct DLDevice_ {
+typedef struct {
   /*! \brief The device type used in the device. */
   DLDeviceType device_type;
   /*!

diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
@@ -116,6 +116,10 @@ if(APPLE AND USE_MPS)
     ${CMAKE_CURRENT_SOURCE_DIR}/mps_test_objc_interface.mm)
 endif()
 
+list(APPEND ATen_XPU_TEST_SRCS
+  ${CMAKE_CURRENT_SOURCE_DIR}/xpu_device_test.cpp
+  )
+
 # Caffe2 specific tests
 if(BUILD_CAFFE2)
   list(APPEND ATen_CPU_TEST_SRCS
@@ -134,3 +138,4 @@ set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE)
 set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE)
+set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE)
diff --git a/aten/src/ATen/test/xpu_device_test.cpp b/aten/src/ATen/test/xpu_device_test.cpp
@@ -0,0 +1,69 @@
+#include <gtest/gtest.h>
+
+#include <aten/src/ATen/xpu/XPUContext.h>
+#include <aten/src/ATen/xpu/XPUDevice.h>
+#include <c10/xpu/XPUFunctions.h>
+
+#define ASSERT_EQ_XPU(X, Y) \
+  {                         \
+    bool _isEQ = X == Y;    \
+    ASSERT_TRUE(_isEQ);     \
+  }
+
+TEST(XpuDeviceTest, getDeviceProperties) {
+  if (!at::xpu::is_available()) {
+    return;
+  }
+
+  c10::xpu::DeviceProp* cur_device_prop = at::xpu::getCurrentDeviceProperties();
+  c10::xpu::DeviceProp* device_prop = at::xpu::getDeviceProperties(0);
+
+  ASSERT_EQ_XPU(cur_device_prop->name, device_prop->name);
+  ASSERT_EQ_XPU(cur_device_prop->platform_name, device_prop->platform_name);
+  ASSERT_EQ_XPU(cur_device_prop->gpu_eu_count, device_prop->gpu_eu_count);
+}
+
+TEST(XpuDeviceTest, getDeviceFromPtr) {
+  if (!at::xpu::is_available()) {
+    return;
+  }
+
+  sycl::device& raw_device = at::xpu::get_raw_device(0);
+  void* ptr = sycl::malloc_device(8, raw_device, at::xpu::get_device_context());
+
+  at::Device device = at::xpu::getDeviceFromPtr(ptr);
+  sycl::free(ptr, at::xpu::get_device_context());
+  ASSERT_EQ_XPU(device.index(), 0);
+  ASSERT_EQ_XPU(device.type(), at::kXPU);
+
+  int dummy = 0;
+  ASSERT_THROW(at::xpu::getDeviceFromPtr(&dummy), c10::Error);
+}
+
+TEST(XpuDeviceTest, getGlobalIdxFromDevice) {
+  if (!at::xpu::is_available()) {
+    return;
+  }
+
+  int target_device = 0;
+  auto global_index = at::xpu::getGlobalIdxFromDevice(target_device);
+  auto devices = sycl::device::get_devices();
+  ASSERT_EQ_XPU(devices[global_index], at::xpu::get_raw_device(target_device));
+
+  void* ptr = sycl::malloc_device(8, devices[global_index], at::xpu::get_device_context());
+  at::Device device = at::xpu::getDeviceFromPtr(ptr);
+  sycl::free(ptr, at::xpu::get_device_context());
+  ASSERT_EQ_XPU(device.index(), target_device);
+  ASSERT_EQ_XPU(device.type(), at::kXPU);
+
+  if (at::xpu::device_count() == 1) {
+    return;
+  }
+  // Test the last device.
+  target_device = at::xpu::device_count() - 1;
+  global_index = at::xpu::getGlobalIdxFromDevice(target_device);
+  ASSERT_EQ_XPU(devices[global_index], at::xpu::get_raw_device(target_device));
+
+  target_device = at::xpu::device_count();
+  ASSERT_THROW(at::xpu::getGlobalIdxFromDevice(target_device), c10::Error);
+}
diff --git a/aten/src/ATen/xpu/XPUContext.cpp b/aten/src/ATen/xpu/XPUContext.cpp
@@ -0,0 +1,89 @@
+#include <ATen/xpu/XPUContext.h>
+#include <c10/util/CallOnce.h>
+#include <c10/util/Exception.h>
+
+#include <cmath>
+#include <deque>
+#include <mutex>
+#include <vector>
+
+namespace at::xpu {
+namespace {
+
+/*
+ * Currently, there is one device properties pool containing the information and
+ * capability about each compute-device.
+ *
+ * Device properties are lazily initialized when the first time properties are
+ * requested for a device.
+ */
+DeviceIndex num_gpus = -1;
+c10::once_flag init_flag;
+std::deque<c10::once_flag> device_prop_flags;
+std::vector<DeviceProp> device_properties;
+
+std::deque<c10::once_flag> device_global_idx_flags;
+std::vector<int32_t> device_global_idxs;
+
+void initXPUContextVectors() {
+  num_gpus = c10::xpu::device_count();
+  device_prop_flags.resize(num_gpus);
+  device_properties.resize(num_gpus);
+  device_global_idx_flags.resize(num_gpus);
+  device_global_idxs.resize(num_gpus);
+}
+
+void initDeviceProperty(DeviceIndex device) {
+  c10::xpu::get_device_properties(&device_properties[device], device);
+}
+
+void initDeviceGlobalIdx(DeviceIndex device) {
+  sycl::device& raw_device = c10::xpu::get_raw_device(device);
+  // Get all SYCL devices associated with the SYCL platform.
+  auto devices = sycl::device::get_devices();
+  auto match_device = [raw_device](const auto& dev) -> bool {
+    return raw_device == dev;
+  };
+  auto it = std::find_if(devices.begin(), devices.end(), match_device);
+  TORCH_CHECK(
+      it != devices.end(), "Cant't find the global index of XPU device.");
+  device_global_idxs[device] =
+      static_cast<int32_t>(std::distance(devices.begin(), it));
+}
+
+inline void check_device(DeviceIndex device) {
+  TORCH_CHECK(
+      device >= 0 && device < num_gpus,
+      "device is out of range, device is ",
+      static_cast<int>(device),
+      ", total number of device is ",
+      static_cast<int>(num_gpus),
+      ".");
+}
+
+} // anonymous namespace
+
+DeviceProp* getCurrentDeviceProperties() {
+  auto device = c10::xpu::current_device();
+  return getDeviceProperties(device);
+}
+
+DeviceProp* getDeviceProperties(DeviceIndex device) {
+  c10::call_once(init_flag, initXPUContextVectors);
+  if (device == -1)
+    device = c10::xpu::current_device();
+  check_device(device);
+  c10::call_once(device_prop_flags[device], initDeviceProperty, device);
+  return &device_properties[device];
+}
+
+// Return the global index enumerated by sycl::device::get_devices based on the
+// index of a XPU device in the framework.
+int32_t getGlobalIdxFromDevice(DeviceIndex device) {
+  c10::call_once(init_flag, initXPUContextVectors);
+  check_device(device);
+  c10::call_once(device_global_idx_flags[device], initDeviceGlobalIdx, device);
+  return device_global_idxs[device];
+}
+
+} // namespace at::xpu