Sync main into release/2.6 branch #1117

Merged: 5 commits, Nov 22, 2024

2 changes: 2 additions & 0 deletions .github/scripts/apply_torch_pr.py
@@ -13,6 +13,8 @@
"https://github.com/pytorch/pytorch/pull/126516",
# Modify the tolerance level in TIMM benchmark
"https://github.com/pytorch/pytorch/pull/129735",
# [XPU] Update XPU C Shim Header
"https://github.com/pytorch/pytorch/pull/141086",
]
)
parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])
3 changes: 2 additions & 1 deletion .github/scripts/env.sh
@@ -1,3 +1,4 @@
#!/bin/bash
source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source /opt/intel/oneapi/umf/latest/env/vars.sh
source /opt/intel/oneapi/pti/latest/env/vars.sh
2 changes: 1 addition & 1 deletion .github/workflows/_linux_ut.yml
@@ -97,7 +97,7 @@ jobs:
run: |
source activate xpu_op_${ZE_AFFINITY_MASK}
source .github/scripts/env.sh
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
cd ../pytorch
if [[ ${{ inputs.abi }} == '0' ]]; then
export _GLIBCXX_USE_CXX11_ABI=0
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand.yml
@@ -123,7 +123,7 @@ jobs:
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_rolling.yml
@@ -125,7 +125,7 @@ jobs:
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_whl.yml
@@ -98,7 +98,7 @@ jobs:
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
2 changes: 1 addition & 1 deletion cmake/BuildFlags.cmake
@@ -122,7 +122,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")

if(WIN32)
set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
set(AOT_TARGETS "ats-m150,mtl-u,mtl-h,xe2-lpg,xe2-hpg")
else()
set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
endif()
10 changes: 7 additions & 3 deletions src/ATen/native/xpu/sycl/MultiTensorApply.h
@@ -68,7 +68,7 @@ static inline int64_t multi_tensor_apply_fused_kernel_get_chunk_size() {
}

template <typename T, typename Y, typename U, typename... ArgTypes>
struct MultiTensorApplyKernelFunctor {
struct MultiTensorApplyKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
void operator()(sycl::nd_item<1> item_id) const {
// Expand the tuple elements manually and call the callable
expandAndCall(item_id, std::index_sequence_for<ArgTypes...>());
@@ -85,6 +85,12 @@ struct MultiTensorApplyKernelFunctor {
callable(callable_),
args(std::make_tuple(args_...)) {}

void sycl_ker_config_convention(sycl::handler& cgh) {
if constexpr (std::is_base_of_v<__SYCL_KER_CONFIG_CONVENTION__, U>) {
callable.sycl_ker_config_convention(cgh);
}
}

private:
template <std::size_t... Indices>
void expandAndCall(sycl::nd_item<1> item_id, std::index_sequence<Indices...>)
@@ -117,7 +123,6 @@ void launch_multi_tensor_apply_kernel(
U callable,
int num_wg,
ArgTypes... args) {

auto& q = getCurrentSYCLQueue();
int64_t simd = syclMaxSubGroupSize();
int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(simd);
@@ -226,7 +231,6 @@ void multi_tensor_apply(
std::vector<std::vector<at::Tensor>>& tensor_lists,
T callable,
ArgTypes... args) {

TORCH_CHECK(
tensor_lists.size() == depth,
"Number of tensor lists has to match he depth");
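
The new hook makes the wrapper composable: MultiTensorApplyKernelFunctor now derives from the project's __SYCL_KER_CONFIG_CONVENTION__ marker and, through the if constexpr check above, forwards sycl_ker_config_convention to the wrapped callable when that callable opts in as well. Below is a minimal sketch of such a callable, assuming the torch-xpu-ops headers that define the marker; the functor name and its use of local memory are hypothetical illustrations, not part of this PR.

#include <sycl/sycl.hpp>

// Hypothetical callable that opts into the convention. Only the marker
// base class and the hook signature are taken from the diff above.
struct ExampleReduceFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
  void operator()(sycl::nd_item<1> item) const {
    // The kernel body would use scratch_ as work-group-local storage.
  }

  // Runs on the host while the command group is composed, so the callable
  // can request per-launch resources such as local memory.
  void sycl_ker_config_convention(sycl::handler& cgh) {
    scratch_ = sycl::local_accessor<float, 1>(sycl::range<1>(wg_size_), cgh);
  }

  explicit ExampleReduceFunctor(size_t wg_size) : wg_size_(wg_size) {}

 private:
  size_t wg_size_;
  sycl::local_accessor<float, 1> scratch_;
};

Callables that do not derive from the marker are unaffected: the if constexpr branch compiles away and no hook is called.
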
61 changes: 38 additions & 23 deletions src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp
@@ -143,41 +143,64 @@ struct alignas(N) OpaqueType {
char data[N];
};

template <int work_group_size, int thread_work_size, typename func_t>
template <typename func_t>
struct ScatterGatherElementwiseKernelFunctor {
void operator()(sycl::nd_item<1> item) const {
constexpr int nv = work_group_size * thread_work_size;
int nv = work_group_size_ * thread_work_size_;
auto wg_id = item.get_group_linear_id();
auto local_id = item.get_local_linear_id();
int idx = nv * wg_id + local_id;
#pragma unroll
for (int i = 0; i < thread_work_size; ++i) {
for (int i = 0; i < thread_work_size_; ++i) {
if (idx < N_) {
f_(idx);
idx += work_group_size;
idx += work_group_size_;
}
}
}
ScatterGatherElementwiseKernelFunctor(int N, func_t f) : N_(N), f_(f) {}
ScatterGatherElementwiseKernelFunctor(
int N,
func_t f,
int work_group_size,
int thread_work_size)
: N_(N),
f_(f),
work_group_size_(work_group_size),
thread_work_size_(thread_work_size) {}

private:
int N_;
func_t f_;
int work_group_size_;
int thread_work_size_;
};

template <int nt, int vt, typename func_t>
template <typename func_t>
static void launch_scatter_gather_kernel(int64_t N, const func_t& f) {
TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
if (N == 0) {
return;
}

sycl::range<1> local_range{(size_t)nt};
int num_workgroups = (N + nt * vt - 1) / (nt * vt);
sycl::range<1> global_range{(size_t)(num_workgroups * nt)};

auto caller =
ScatterGatherElementwiseKernelFunctor<nt, vt, func_t>((int)N, f);
using KernelFn = ScatterGatherElementwiseKernelFunctor<func_t>;
int64_t max_wg_size = syclMaxWorkGroupSize<KernelFn>();
int outputSize = N;
int work_group_size = outputSize > max_wg_size ? max_wg_size : outputSize;
const auto target_global_size = syclMaxWorkItemsPerTile();
// Each work group covers work_group_size items and one full device launch
// covers target_global_size items, so the maximum work group count is:
const int max_work_group_num = target_global_size / work_group_size;
int work_group_num = outputSize / work_group_size < max_work_group_num
? outputSize / work_group_size
: max_work_group_num;
int draft_work_group_num =
(outputSize + work_group_size - 1) / work_group_size;

int thread_work_size = draft_work_group_num / work_group_num + 1;

sycl::range<1> local_range(work_group_size);
sycl::range<1> global_range(work_group_num * work_group_size);

auto caller = KernelFn((int)N, f, work_group_size, thread_work_size);
sycl_kernel_submit(
global_range, local_range, at::xpu::getCurrentSYCLQueue(), caller);
}
@@ -268,11 +291,7 @@ struct ScatterGatherInternalKernel {
numel,
f);

// TODO: optimize it
constexpr int group_work_items = 256;
constexpr int work_size_per_item = 4;
launch_scatter_gather_kernel<group_work_items, work_size_per_item>(
iter.numel(), loop);
launch_scatter_gather_kernel(iter.numel(), loop);
}
};

@@ -521,11 +540,7 @@ struct ScatterFillInternalKernel {
decltype(offset_calc),
func_t>(self_ptr, index_ptr, offset_calc, index_stride, f, src_val);

// TODO: optimize it
constexpr int group_work_items = 256;
constexpr int work_size_per_item = 4;
launch_scatter_gather_kernel<group_work_items, work_size_per_item>(
iter.numel(), loop);
launch_scatter_gather_kernel(iter.numel(), loop);
}
};

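
The old launch path hard-coded 256 work items per group and 4 elements per work item; the new one derives the geometry from device limits at run time. Below is a standalone sketch of that arithmetic, with syclMaxWorkGroupSize and syclMaxWorkItemsPerTile replaced by hypothetical constants (1024 and 65536) so the numbers can be traced by hand.

#include <cstdio>

int main() {
  const int N = 1000000;                 // elements to process
  const int max_wg_size = 1024;          // stand-in for syclMaxWorkGroupSize
  const int max_items_per_tile = 65536;  // stand-in for syclMaxWorkItemsPerTile

  // Mirrors the computation in launch_scatter_gather_kernel above.
  int work_group_size = N > max_wg_size ? max_wg_size : N;             // 1024
  int max_work_group_num = max_items_per_tile / work_group_size;       // 64
  int work_group_num = N / work_group_size < max_work_group_num
      ? N / work_group_size
      : max_work_group_num;                                            // 64
  int draft_work_group_num =
      (N + work_group_size - 1) / work_group_size;                     // 977
  int thread_work_size = draft_work_group_num / work_group_num + 1;    // 16

  // 64 groups x 1024 items x 16 elements per item = 1,048,576 >= N, so the
  // idx < N_ bound in operator() covers every element.
  std::printf("%d groups x %d items, %d elements per work item\n",
              work_group_num, work_group_size, thread_work_size);
  return 0;
}

The +1 when computing thread_work_size rounds up, trading a few predicated-off tail iterations for guaranteed coverage when draft_work_group_num is not a multiple of work_group_num.
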
83 changes: 75 additions & 8 deletions src/BuildOnWindows.cmake
@@ -3,11 +3,6 @@
set(TORCH_XPU_OPS_LIBRARIES)
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)

# Walk around cyclic dependence
# libtorch_xpu.so links to libtorch_xpu_ops.a
# Load libtorch_xpu_ops_aten.so explicitly by torch/__init__.py:_load_dll_libraries (Break cycle)
# libtorch_xpu_ops_aten.so links to libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so
# libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so links to libtorch_xpu.so
add_library(
torch_xpu_ops
STATIC
@@ -21,7 +16,6 @@ add_library(
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
# target_compile_definitions(torch_xpu_ops_aten PRIVATE CAFFE2_BUILD_MAIN_LIB)
target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
@@ -48,8 +42,11 @@ else()
set(ATen_XPU_SYCL_REDUCE_SRCS)
set(ATen_XPU_SYCL_ACTIVATION_SRCS)
set(ATen_XPU_SYCL_FOREACH_SRCS)
set(ATen_XPU_SYCL_TENSOR_SRCS)
set(ATen_XPU_SYCL_NORM_LOSS_SRCS)
set(ATen_XPU_SYCL_POLY_SRCS)
set(ATen_XPU_SYCL_DISTRIBUTION_SRCS)
set(ATen_XPU_SYCL_OTHERS_SRCS)

foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
string(REGEX MATCH "Binary" IS_BINARY ${sycl_src})
string(REGEX MATCH "Unary" IS_UNARY ${sycl_src})
@@ -63,6 +60,13 @@
string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src})
string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src})
string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
string(REGEX MATCH "Tensor" IS_TENSOR ${sycl_src})
string(REGEX MATCH "Norm" IS_NORM ${sycl_src})
string(REGEX MATCH "Loss" IS_LOSS ${sycl_src})
string(REGEX MATCH "Polynomial" IS_POLY ${sycl_src})
# Put the Resize kernel into the Norm and Loss lib to resolve symbol dependencies.
string(REGEX MATCH "Resize" IS_RESIZE ${sycl_src})
string(REGEX MATCH "Distribution" IS_DISTRIBUTION ${sycl_src})

if(NOT IS_FOREACH STREQUAL "")
list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src})
@@ -74,11 +78,18 @@
list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src})
elseif(NOT IS_ACTIVATION STREQUAL "")
list(APPEND ATen_XPU_SYCL_ACTIVATION_SRCS ${sycl_src})
elseif(NOT IS_TENSOR STREQUAL "")
list(APPEND ATen_XPU_SYCL_TENSOR_SRCS ${sycl_src})
elseif(NOT IS_DISTRIBUTION STREQUAL "")
list(APPEND ATen_XPU_SYCL_DISTRIBUTION_SRCS ${sycl_src})
elseif(NOT IS_NORM STREQUAL "" OR NOT IS_LOSS STREQUAL "" OR NOT IS_RESIZE STREQUAL "")
list(APPEND ATen_XPU_SYCL_NORM_LOSS_SRCS ${sycl_src})
elseif(NOT IS_POLY STREQUAL "")
list(APPEND ATen_XPU_SYCL_POLY_SRCS ${sycl_src})
else()
list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src})
endif()
endforeach()

# Binary kernel lib
set(sycl_binary_lib torch_xpu_ops_sycl_binary_kernels)
sycl_add_library(
@@ -148,7 +159,63 @@ else()

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Tensor kernel lib
set(sycl_tensor_lib torch_xpu_ops_sycl_tensor_kernels)
sycl_add_library(
${sycl_tensor_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_TENSOR_SRCS})
target_compile_definitions(${sycl_tensor_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_tensor_lib})
target_link_libraries(${sycl_tensor_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_tensor_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_tensor_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Norm and Loss kernel lib
set(sycl_norm_loss_lib torch_xpu_ops_sycl_norm_loss_kernels)
sycl_add_library(
${sycl_norm_loss_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_NORM_LOSS_SRCS})
target_compile_definitions(${sycl_norm_loss_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_norm_loss_lib})
target_link_libraries(${sycl_norm_loss_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_norm_loss_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_norm_loss_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Polynomial kernel lib
set(sycl_poly_lib torch_xpu_ops_sycl_poly_kernels)
sycl_add_library(
${sycl_poly_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_POLY_SRCS})
target_compile_definitions(${sycl_poly_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_poly_lib})
target_link_libraries(${sycl_poly_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_poly_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_poly_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Distribution kernel lib
set(sycl_dist_lib torch_xpu_ops_sycl_dist_kernels)
sycl_add_library(
${sycl_dist_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_DISTRIBUTION_SRCS})
target_compile_definitions(${sycl_dist_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_dist_lib})
target_link_libraries(${sycl_dist_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_dist_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_dist_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Other kernel lib
set(sycl_lib torch_xpu_ops_sycl_kernels)
sycl_add_library(
2 changes: 1 addition & 1 deletion test/xpu/test_binary_ufuncs_xpu.py
@@ -65,7 +65,7 @@ def to_np(value):
else:
self.assertRaisesRegex(
RuntimeError,
"Found dtype \\w+ but expected \\w+",
r"result type \w+ can't be cast to the desired output type \w+",
lambda: actual.pow_(exponent),
)
