Skip to content

Commit

Permalink
[2/4] Intel GPU Runtime Upstreaming for Device (pytorch#116833)
Browse files Browse the repository at this point in the history
# Motivation
According to [[1/4] Intel GPU Runtime Upstreaming for Device](pytorch#116019), as mentioned in [[RFC] Intel GPU Runtime Upstreaming](pytorch#114842), the second PR  covers the changes under `aten`.

# Design
We will compile the code for XPU separately into a library named `libtorch_xpu.so`. Currently, it primarily offers device-related APIs, including
- `getCurrentDeviceProperties`
- `getDeviceProperties`
- `getGlobalIdxFromDevice`
- `getDeviceFromPtr`

# Additional Context
`XPUHooks` is an indispensable part of the runtime. We upstream `XPUHooks` in this PR since there is some code related to `Device` in it and we also refine some logic and code to avoid forward declaration in `DLPack`.

Pull Request resolved: pytorch#116833
Approved by: https://github.com/EikanWang, https://github.com/jgong5, https://github.com/gujinghui, https://github.com/malfet
  • Loading branch information
guangyey authored and pytorchmergebot committed Jan 18, 2024
1 parent 61ea303 commit 79811e7
Show file tree
Hide file tree
Showing 19 changed files with 369 additions and 43 deletions.
2 changes: 2 additions & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ code = 'CLANGFORMAT'
include_patterns = [
'aten/src/ATen/*.h',
'aten/src/ATen/mps/**/*.mm',
'aten/src/ATen/xpu/**/*.h',
'aten/src/ATen/xpu/**/*.cpp',
'aten/src/ATen/native/mps/**/*.mm',
'aten/src/ATen/native/vulkan/**/*.h',
'aten/src/ATen/native/vulkan/**/*.cpp',
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1171,6 +1171,10 @@ if(BUILD_SHARED_LIBS)
${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake
DESTINATION share/cmake/Caffe2/
COMPONENT dev)
install(FILES
${PROJECT_SOURCE_DIR}/cmake/Modules/FindSYCLToolkit.cmake
DESTINATION share/cmake/Caffe2/
COMPONENT dev)

install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
FILE Caffe2Targets.cmake
Expand Down
6 changes: 6 additions & 0 deletions aten/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ set(ATen_HIP_TEST_SRCS)
set(ATen_HIP_INCLUDE)
set(ATen_MPS_SRCS)
set(ATen_MPS_TEST_SRCS)
set(ATen_XPU_SRCS)
set(ATen_XPU_INCLUDE)
set(ATen_XPU_TEST_SRCS)
set(ATen_VULKAN_TEST_SRCS)
set(ATen_CPU_DEPENDENCY_LIBS)
set(ATen_CUDA_DEPENDENCY_LIBS)
Expand Down Expand Up @@ -111,6 +114,8 @@ set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE)
set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE)
set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE)
set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE)
set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
Expand All @@ -122,6 +127,7 @@ set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE)
set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE)
set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE)
set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
Expand Down
13 changes: 12 additions & 1 deletion aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ file(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh")
file(GLOB cudnn_cpp "cudnn/*.cpp")
file(GLOB ops_h "ops/*.h")

file(GLOB xpu_h "xpu/*.h" "xpu/detail/*.h")
file(GLOB xpu_cpp "xpu/*.cpp" "xpu/detail/*.cpp")

file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h")
file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp")
list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp")
Expand Down Expand Up @@ -321,6 +324,11 @@ if(USE_ROCM)
)
endif()

if(USE_XPU)
list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/xpu)
list(APPEND ATen_XPU_SRCS ${xpu_cpp})
endif()

list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..)

if(USE_TBB)
Expand Down Expand Up @@ -579,7 +587,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"

set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
if(NOT INTERN_BUILD_MOBILE)
list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
# Metal
if(USE_PYTORCH_METAL_EXPORT)
# Add files needed from exporting metal models(optimized_for_mobile)
Expand Down Expand Up @@ -649,9 +657,11 @@ set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SC
set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE)
set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE)
set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE)
set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE)
set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
Expand All @@ -664,6 +674,7 @@ set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE)
set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE)
set(ATen_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE} PARENT_SCOPE)
set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
Expand Down
15 changes: 2 additions & 13 deletions aten/src/ATen/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ class TORCH_API Context {
return c10::DeviceType::CPU;
} else if (device_type == at::kCUDA) {
return at::detail::getCUDAHooks().getDeviceFromPtr(data);
} else if (device_type == at::kXPU) {
return at::detail::getXPUHooks().getDeviceFromPtr(data);
} else if (device_type == at::kPrivateUse1) {
return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data);
} else {
Expand Down Expand Up @@ -481,19 +483,6 @@ static inline void manual_seed(uint64_t seed) {
}
}

const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs();
if (hasXPU() && xpu_num_gpus > 0) {
for (const auto i : c10::irange(xpu_num_gpus)) {
auto xpu_gen = globalContext().defaultGenerator(
Device(at::kXPU, static_cast<c10::DeviceIndex>(i)));
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(xpu_gen.mutex());
xpu_gen.set_current_seed(seed);
}
}
}

if (hasMPS()) {
auto mps_gen = globalContext().defaultGenerator(c10::DeviceType::MPS);
// See Note [Acquire lock when using random generators]
Expand Down
7 changes: 4 additions & 3 deletions aten/src/ATen/DLConvertor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,9 @@ static DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) {
ctx.device_type = DLDeviceType::kDLROCM;
break;
case DeviceType::XPU:
ctx = at::detail::getXPUHooks().getDLPackDeviceFromATenDevice(
ctx, tensor.device(), tensor.data_ptr());
ctx.device_type = DLDeviceType::kDLOneAPI;
ctx.device_id =
at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device());
break;
default:
TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str());
Expand All @@ -139,7 +140,7 @@ static Device getATenDevice(const DLDevice& ctx, void* data) {
return at::Device(DeviceType::HIP, ctx.device_id);
#endif
case DLDeviceType::kDLOneAPI:
return at::detail::getXPUHooks().getATenDeviceFromDLPackDevice(ctx, data);
return at::detail::getXPUHooks().getDeviceFromPtr(data);
default:
TORCH_CHECK(
false, "Unsupported device_type: " + c10::to_string(ctx.device_type));
Expand Down
28 changes: 6 additions & 22 deletions aten/src/ATen/detail/XPUHooksInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,6 @@
#include <functional>
#include <memory>

// We use forward declaration here instead of #include <ATen/dlpack.h> to avoid
// leaking DLPack implementation detail to every project that includes `ATen/Context.h`, which in turn
// would lead to a conflict when linked with another project using DLPack (for example TVM)
struct DLDevice_;

namespace at {

constexpr const char* XPU_HELP =
Expand Down Expand Up @@ -44,23 +39,8 @@ struct TORCH_API XPUHooksInterface {
XPU_HELP);
}

virtual Device getATenDeviceFromDLPackDevice(
const DLDevice_& dl_device,
void* data) const {
TORCH_CHECK(
false,
"Cannot get XPU device without Intel Extension for Pytorch. ",
XPU_HELP);
}

virtual DLDevice_& getDLPackDeviceFromATenDevice(
DLDevice_& dl_device,
const Device& aten_device,
void* data) const {
TORCH_CHECK(
false,
"Cannot get XPU DL device without Intel Extension for Pytorch. ",
XPU_HELP);
virtual int getGlobalIdxFromDevice(const Device& device) const {
TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library.");
}

virtual Generator getXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
Expand All @@ -74,6 +54,10 @@ struct TORCH_API XPUHooksInterface {
virtual int getNumGPUs() const {
return 0;
}

virtual Device getDeviceFromPtr(void* /*data*/) const {
TORCH_CHECK(false, "Cannot get device of pointer on XPU without ATen_xpu library.");
}
};

struct TORCH_API XPUHooksArgs {};
Expand Down
5 changes: 1 addition & 4 deletions aten/src/ATen/dlpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,7 @@ typedef enum {
/*!
* \brief A Device for Tensor and operator.
*/
// NB: This is the only difference from
// https://github.com/dmlc/dlpack/blob/v0.7/include/dlpack/dlpack.h Required to
// allow forward declaration of DLDevice.
typedef struct DLDevice_ {
typedef struct {
/*! \brief The device type used in the device. */
DLDeviceType device_type;
/*!
Expand Down
5 changes: 5 additions & 0 deletions aten/src/ATen/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ if(APPLE AND USE_MPS)
${CMAKE_CURRENT_SOURCE_DIR}/mps_test_objc_interface.mm)
endif()

list(APPEND ATen_XPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/xpu_device_test.cpp
)

# Caffe2 specific tests
if(BUILD_CAFFE2)
list(APPEND ATen_CPU_TEST_SRCS
Expand All @@ -134,3 +138,4 @@ set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} PARENT_SCOPE)
set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE)
set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE)
set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE)
69 changes: 69 additions & 0 deletions aten/src/ATen/test/xpu_device_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include <gtest/gtest.h>

#include <aten/src/ATen/xpu/XPUContext.h>
#include <aten/src/ATen/xpu/XPUDevice.h>
#include <c10/xpu/XPUFunctions.h>

#define ASSERT_EQ_XPU(X, Y) \
{ \
bool _isEQ = X == Y; \
ASSERT_TRUE(_isEQ); \
}

TEST(XpuDeviceTest, getDeviceProperties) {
if (!at::xpu::is_available()) {
return;
}

c10::xpu::DeviceProp* cur_device_prop = at::xpu::getCurrentDeviceProperties();
c10::xpu::DeviceProp* device_prop = at::xpu::getDeviceProperties(0);

ASSERT_EQ_XPU(cur_device_prop->name, device_prop->name);
ASSERT_EQ_XPU(cur_device_prop->platform_name, device_prop->platform_name);
ASSERT_EQ_XPU(cur_device_prop->gpu_eu_count, device_prop->gpu_eu_count);
}

TEST(XpuDeviceTest, getDeviceFromPtr) {
if (!at::xpu::is_available()) {
return;
}

sycl::device& raw_device = at::xpu::get_raw_device(0);
void* ptr = sycl::malloc_device(8, raw_device, at::xpu::get_device_context());

at::Device device = at::xpu::getDeviceFromPtr(ptr);
sycl::free(ptr, at::xpu::get_device_context());
ASSERT_EQ_XPU(device.index(), 0);
ASSERT_EQ_XPU(device.type(), at::kXPU);

int dummy = 0;
ASSERT_THROW(at::xpu::getDeviceFromPtr(&dummy), c10::Error);
}

TEST(XpuDeviceTest, getGlobalIdxFromDevice) {
if (!at::xpu::is_available()) {
return;
}

int target_device = 0;
auto global_index = at::xpu::getGlobalIdxFromDevice(target_device);
auto devices = sycl::device::get_devices();
ASSERT_EQ_XPU(devices[global_index], at::xpu::get_raw_device(target_device));

void* ptr = sycl::malloc_device(8, devices[global_index], at::xpu::get_device_context());
at::Device device = at::xpu::getDeviceFromPtr(ptr);
sycl::free(ptr, at::xpu::get_device_context());
ASSERT_EQ_XPU(device.index(), target_device);
ASSERT_EQ_XPU(device.type(), at::kXPU);

if (at::xpu::device_count() == 1) {
return;
}
// Test the last device.
target_device = at::xpu::device_count() - 1;
global_index = at::xpu::getGlobalIdxFromDevice(target_device);
ASSERT_EQ_XPU(devices[global_index], at::xpu::get_raw_device(target_device));

target_device = at::xpu::device_count();
ASSERT_THROW(at::xpu::getGlobalIdxFromDevice(target_device), c10::Error);
}
89 changes: 89 additions & 0 deletions aten/src/ATen/xpu/XPUContext.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#include <ATen/xpu/XPUContext.h>
#include <c10/util/CallOnce.h>
#include <c10/util/Exception.h>

#include <cmath>
#include <deque>
#include <mutex>
#include <vector>

namespace at::xpu {
namespace {

/*
* Currently, there is one device properties pool containing the information and
* capability about each compute-device.
*
* Device properties are lazily initialized when the first time properties are
* requested for a device.
*/
DeviceIndex num_gpus = -1;
c10::once_flag init_flag;
std::deque<c10::once_flag> device_prop_flags;
std::vector<DeviceProp> device_properties;

std::deque<c10::once_flag> device_global_idx_flags;
std::vector<int32_t> device_global_idxs;

void initXPUContextVectors() {
num_gpus = c10::xpu::device_count();
device_prop_flags.resize(num_gpus);
device_properties.resize(num_gpus);
device_global_idx_flags.resize(num_gpus);
device_global_idxs.resize(num_gpus);
}

void initDeviceProperty(DeviceIndex device) {
c10::xpu::get_device_properties(&device_properties[device], device);
}

void initDeviceGlobalIdx(DeviceIndex device) {
sycl::device& raw_device = c10::xpu::get_raw_device(device);
// Get all SYCL devices associated with the SYCL platform.
auto devices = sycl::device::get_devices();
auto match_device = [raw_device](const auto& dev) -> bool {
return raw_device == dev;
};
auto it = std::find_if(devices.begin(), devices.end(), match_device);
TORCH_CHECK(
it != devices.end(), "Cant't find the global index of XPU device.");
device_global_idxs[device] =
static_cast<int32_t>(std::distance(devices.begin(), it));
}

inline void check_device(DeviceIndex device) {
TORCH_CHECK(
device >= 0 && device < num_gpus,
"device is out of range, device is ",
static_cast<int>(device),
", total number of device is ",
static_cast<int>(num_gpus),
".");
}

} // anonymous namespace

DeviceProp* getCurrentDeviceProperties() {
auto device = c10::xpu::current_device();
return getDeviceProperties(device);
}

DeviceProp* getDeviceProperties(DeviceIndex device) {
c10::call_once(init_flag, initXPUContextVectors);
if (device == -1)
device = c10::xpu::current_device();
check_device(device);
c10::call_once(device_prop_flags[device], initDeviceProperty, device);
return &device_properties[device];
}

// Return the global index enumerated by sycl::device::get_devices based on the
// index of a XPU device in the framework.
int32_t getGlobalIdxFromDevice(DeviceIndex device) {
c10::call_once(init_flag, initXPUContextVectors);
check_device(device);
c10::call_once(device_global_idx_flags[device], initDeviceGlobalIdx, device);
return device_global_idxs[device];
}

} // namespace at::xpu
Loading

0 comments on commit 79811e7

Please sign in to comment.