merge main (#1051)

chunhuanMeng authored Nov 6, 2024
2 parents 69ebbed + fda86cf, commit 0c0c6cd
Showing 101 changed files with 6,843 additions and 227 deletions.
4 changes: 0 additions & 4 deletions .github/scripts/apply_torch_pr.py
@@ -11,10 +11,6 @@
default=[
# Fallback to CPU for XPU FP64
"https://github.com/pytorch/pytorch/pull/126516",
# Enable deterministic for mkldnn ops
"https://github.com/pytorch/pytorch/pull/127277",
# [Inductor][Intel GPU] Support reduction split.
"https://github.com/pytorch/pytorch/pull/129120",
# Modify the tolerance level in TIMM benchmark
"https://github.com/pytorch/pytorch/pull/129735",
# [Intel GPU] Allow XPU device in cdist and pdist operators
4 changes: 3 additions & 1 deletion .github/workflows/_linux_ut.yml
@@ -66,7 +66,7 @@ jobs:
source activate xpu_op_${ZE_AFFINITY_MASK}
cd ../ && rm -rf pytorch
git clone https://github.com/pytorch/pytorch pytorch
cd pytorch && git checkout $(echo ${{ env.pytorch }} |sed 's/^nightly_wheel$/nightly/')
cd pytorch && git checkout $(echo ${{ inputs.pytorch }} |sed 's/^nightly_wheel$/nightly/')
# apply PRs for stock pytorch
pip install requests
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
@@ -125,6 +125,8 @@ jobs:
cd ..
python pytorch/torch/utils/collect_env.py
rm -rf /tmp/torchinductor_*
rm -rf ~/.triton/cache
- name: Run XPU OP Examples
if: contains(inputs.ut, 'op_regression') || github.event_name == 'schedule'
run: |
1 change: 1 addition & 0 deletions .github/workflows/nightly_ondemand.yml
@@ -186,6 +186,7 @@ jobs:
echo "$GITHUB_ENV"
rm -rf ../pytorch/inductor_log
rm -rf /tmp/torchinductor_*
rm -rf ~/.triton/cache
# Nightly launch
- name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test
3 changes: 2 additions & 1 deletion cmake/BuildFlags.cmake
@@ -50,7 +50,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
if(USE_PER_OPERATOR_HEADERS)
list(APPEND SYCL_HOST_FLAGS -DAT_PER_OPERATOR_HEADERS)
endif()

list(APPEND SYCL_HOST_FLAGS -D__INTEL_LLVM_COMPILER_VERSION=${__INTEL_LLVM_COMPILER})
# -- Kernel flags (SYCL_KERNEL_OPTIONS)
# The fast-math will be enabled by default in SYCL compiler.
# Refer to [https://clang.llvm.org/docs/UsersManual.html#cmdoption-fno-fast-math]
@@ -89,6 +89,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES)
set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI})
endif()
set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_LLVM_COMPILER_VERSION=${__INTEL_LLVM_COMPILER})

CHECK_SYCL_FLAG("-fsycl-fp64-conv-emu" SUPPORTS_FP64_CONV_EMU)
if(SUPPORTS_FP64_CONV_EMU)
57 changes: 46 additions & 11 deletions cmake/Modules/FindSYCLToolkit.cmake
@@ -35,6 +35,7 @@ endif()
if(SYCLTOOLKIT_FOUND)
return()
endif()

set(SYCLTOOLKIT_FOUND TRUE)

include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
@@ -77,7 +78,7 @@ endif()

# Function to write a test case to verify SYCL features.

function(SYCL_CMPLR_TEST_WRITE src)
function(SYCL_CMPLR_TEST_WRITE src macro_name)

set(cpp_macro_if "#if")
set(cpp_macro_endif "#endif")
@@ -88,8 +89,8 @@ function(SYCL_CMPLR_TEST_WRITE src)

# Feature tests goes here

string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_if} defined(SYCL_LANGUAGE_VERSION)\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "cout << \"SYCL_LANGUAGE_VERSION=\"<<SYCL_LANGUAGE_VERSION<<endl;\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_if} defined(${macro_name})\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "cout << \"${macro_name}=\"<<${macro_name}<<endl;\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_endif}\n")

string(APPEND SYCL_CMPLR_TEST_CONTENT "return 0;}\n")
@@ -103,6 +104,7 @@ endfunction()
function(SYCL_CMPLR_TEST_BUILD error TEST_SRC_FILE TEST_EXE)

set(SYCL_CXX_FLAGS_LIST "${SYCL_CXX_FLAGS}")
string(REPLACE "-Wno-stringop-overflow" "" SYCL_CXX_FLAGS_LIST "${SYCL_CXX_FLAGS_LIST}")
separate_arguments(SYCL_CXX_FLAGS_LIST)

execute_process(
@@ -150,19 +152,19 @@ function(SYCL_CMPLR_TEST_RUN error TEST_EXE)

endfunction()

function(SYCL_CMPLR_TEST_EXTRACT test_output)
function(SYCL_CMPLR_TEST_EXTRACT test_output macro_name)

string(REGEX REPLACE "\n" ";" test_output_list "${test_output}")

set(SYCL_LANGUAGE_VERSION "")
set(${macro_name} "")
foreach(strl ${test_output_list})
if(${strl} MATCHES "^SYCL_LANGUAGE_VERSION=([A-Za-z0-9_]+)$")
string(REGEX REPLACE "^SYCL_LANGUAGE_VERSION=" "" extracted_sycl_lang "${strl}")
set(SYCL_LANGUAGE_VERSION ${extracted_sycl_lang})
if(${strl} MATCHES "^${macro_name}=([A-Za-z0-9_]+)$")
string(REGEX REPLACE "^${macro_name}=" "" extracted_sycl_lang "${strl}")
set(${macro_name} ${extracted_sycl_lang})
endif()
endforeach()

set(SYCL_LANGUAGE_VERSION "${SYCL_LANGUAGE_VERSION}" PARENT_SCOPE)
set(${macro_name} "${extracted_sycl_lang}" PARENT_SCOPE)
endfunction()

set(SYCL_FLAGS "")
@@ -189,7 +191,7 @@ if(${has_werror} EQUAL -1)
# Create the test source file
set(TEST_SRC_FILE "${SYCL_CMPLR_TEST_DIR}/sycl_features.cpp")
set(TEST_EXE "${TEST_SRC_FILE}.exe")
SYCL_CMPLR_TEST_WRITE(${TEST_SRC_FILE})
SYCL_CMPLR_TEST_WRITE(${TEST_SRC_FILE} "SYCL_LANGUAGE_VERSION")

# Build the test and create test executable
SYCL_CMPLR_TEST_BUILD(error ${TEST_SRC_FILE} ${TEST_EXE})
@@ -204,7 +206,7 @@ if(${has_werror} EQUAL -1)
endif()

# Extract test output for information
SYCL_CMPLR_TEST_EXTRACT(${test_output})
SYCL_CMPLR_TEST_EXTRACT(${test_output} "SYCL_LANGUAGE_VERSION")

# As per specification, all the SYCL compatible compilers should
# define macro SYCL_LANGUAGE_VERSION
@@ -221,5 +223,38 @@ if(${has_werror} EQUAL -1)
set(SYCL_LANGUAGE_VERSION "${SYCL_LANGUAGE_VERSION}" CACHE STRING "SYCL Language version")
endif()

# Create a clean working directory.
set(SYCL_CMPLR_TEST_DIR "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/TESTSYCLCMPLR")
file(REMOVE_RECURSE ${SYCL_CMPLR_TEST_DIR})
file(MAKE_DIRECTORY ${SYCL_CMPLR_TEST_DIR})
# Create the test source file
set(TEST_SRC_FILE "${SYCL_CMPLR_TEST_DIR}/llvm_features.cpp")
set(TEST_EXE "${TEST_SRC_FILE}.exe")
SYCL_CMPLR_TEST_WRITE(${TEST_SRC_FILE} "__INTEL_LLVM_COMPILER")
# Build the test and create test executable
SYCL_CMPLR_TEST_BUILD(error ${TEST_SRC_FILE} ${TEST_EXE})
if(error)
message(FATAL_ERROR "Can not build SYCL_CMPLR_TEST")
endif()
# Execute the test to extract information
SYCL_CMPLR_TEST_RUN(error ${TEST_EXE})
if(error)
message(FATAL_ERROR "Can not run SYCL_CMPLR_TEST")
endif()
# Extract test output for information
SYCL_CMPLR_TEST_EXTRACT(${test_output} "__INTEL_LLVM_COMPILER")

# Check whether the value of __INTEL_LLVM_COMPILER macro was successfully extracted
string(COMPARE EQUAL "${__INTEL_LLVM_COMPILER}" "" nosycllang)
if(nosycllang)
set(SYCLTOOLKIT_FOUND False)
set(SYCL_REASON_FAILURE "Can not find __INTEL_LLVM_COMPILER")
set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
endif()


# Include in Cache
set(__INTEL_LLVM_COMPILER "${__INTEL_LLVM_COMPILER}" CACHE STRING "Intel llvm compiler")

message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}")
message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}")
11 changes: 11 additions & 0 deletions src/ATen/native/xpu/AiryAi.cpp
@@ -0,0 +1,11 @@
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/UnaryOps.h>
#include <ATen/native/xpu/sycl/AiryAiKernel.h>

namespace at {
namespace native {
REGISTER_XPU_DISPATCH(special_airy_ai_stub, &xpu::airy_ai_kernel);

} // namespace native
} // namespace at
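With the stub registered, the operator is reachable through the ordinary ATen entry point; a minimal sketch, assuming an XPU-enabled PyTorch build (not part of this commit):

// Minimal sketch: calling the newly wired Airy function on an XPU tensor.
// Assumes an XPU-enabled ATen build; not part of this commit.
#include <ATen/ATen.h>

int main() {
  at::Tensor x = at::linspace(-2.0, 2.0, 8,
      at::TensorOptions().dtype(at::kFloat).device(at::kXPU));
  at::Tensor y = at::special_airy_ai(x);  // dispatches to xpu::airy_ai_kernel
  return 0;
}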
33 changes: 26 additions & 7 deletions src/ATen/native/xpu/Bessel.cpp
@@ -1,7 +1,7 @@
#include <ATen/native/UnaryOps.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/UnaryOps.h>
#include <ATen/native/xpu/sycl/BesselJ0Kernel.h>
#include <ATen/native/xpu/sycl/BesselJ1Kernel.h>
#include <ATen/native/xpu/sycl/BesselY0Kernel.h>
@@ -10,6 +10,8 @@
#include <ATen/native/xpu/sycl/ModifiedBesselI1Kernel.h>
#include <ATen/native/xpu/sycl/ModifiedBesselK0Kernel.h>
#include <ATen/native/xpu/sycl/ModifiedBesselK1Kernel.h>
#include <ATen/native/xpu/sycl/ScaledModifiedBesselK0Kernel.h>
#include <ATen/native/xpu/sycl/ScaledModifiedBesselK1Kernel.h>
#include <ATen/native/xpu/sycl/SphericalBesselJ0Kernel.h>

namespace at {
@@ -18,10 +20,27 @@ REGISTER_XPU_DISPATCH(special_bessel_j0_stub, &xpu::bessel_j0_kernel);
REGISTER_XPU_DISPATCH(special_bessel_j1_stub, &xpu::bessel_j1_kernel);
REGISTER_XPU_DISPATCH(special_bessel_y0_stub, &xpu::bessel_y0_kernel);
REGISTER_XPU_DISPATCH(special_bessel_y1_stub, &xpu::bessel_y1_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_i0_stub, &xpu::modified_bessel_i0_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_i1_stub, &xpu::modified_bessel_i1_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_k0_stub, &xpu::modified_bessel_k0_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_k1_stub, &xpu::modified_bessel_k1_kernel);
REGISTER_XPU_DISPATCH(special_spherical_bessel_j0_stub, &xpu::spherical_bessel_j0_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_i0_stub,
&xpu::modified_bessel_i0_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_i1_stub,
&xpu::modified_bessel_i1_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_k0_stub,
&xpu::modified_bessel_k0_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_k1_stub,
&xpu::modified_bessel_k1_kernel);
REGISTER_XPU_DISPATCH(
special_spherical_bessel_j0_stub,
&xpu::spherical_bessel_j0_kernel);
REGISTER_XPU_DISPATCH(
special_scaled_modified_bessel_k0_stub,
&xpu::scaled_modified_bessel_k0_kernel);
REGISTER_XPU_DISPATCH(
special_scaled_modified_bessel_k1_stub,
&xpu::scaled_modified_bessel_k1_kernel);

} // namespace native
} // namespace at
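The two scaled variants become callable through their ATen wrappers once the stubs are registered; a hedged sketch (XPU-enabled build assumed, not part of this commit):

// Sketch: the newly registered scaled modified Bessel ops via ATen.
// Assumes an XPU-enabled build; not part of this commit.
#include <ATen/ATen.h>

int main() {
  auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kXPU);
  at::Tensor x = at::rand({4}, opts) + 0.5;  // keep arguments positive
  at::Tensor k0e = at::special_scaled_modified_bessel_k0(x);
  at::Tensor k1e = at::special_scaled_modified_bessel_k1(x);
  return 0;
}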
11 changes: 9 additions & 2 deletions src/ATen/native/xpu/BinaryOps.cpp
@@ -14,17 +14,19 @@
#include <ATen/native/xpu/sycl/BinaryMiscOpsKernels.h>
#include <ATen/native/xpu/sycl/BinaryRemainderKernel.h>
#include <ATen/native/xpu/sycl/BinaryShiftOpsKernels.h>
#include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
#include <ATen/native/xpu/sycl/CopysignKernel.h>
#include <ATen/native/xpu/sycl/GcdLcmKernels.h>
#include <ATen/native/xpu/sycl/HermitePolynomialHKernel.h>
#include <ATen/native/xpu/sycl/HermitePolynomialHeKernel.h>
#include <ATen/native/xpu/sycl/IGammaKernel.h>
#include <ATen/native/xpu/sycl/LaguerrePolynomialLKernel.h>
#include <ATen/native/xpu/sycl/LegendrePolynomialPKernel.h>
#include <ATen/native/xpu/sycl/LogAddExpKernels.h>
#include <ATen/native/xpu/sycl/MaxMinElementwiseKernels.h>
#include <ATen/native/xpu/sycl/StepKernels.h>
#include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
#include <ATen/native/xpu/sycl/ShiftedChebyshevPolynomialKernels.h>
#include <ATen/native/xpu/sycl/StepKernels.h>
#include <ATen/native/xpu/sycl/ZetaKernel.h>

namespace at {
namespace native {
@@ -47,7 +49,10 @@ REGISTER_XPU_DISPATCH(maximum_stub, &xpu::maximum_kernel);
REGISTER_XPU_DISPATCH(minimum_stub, &xpu::minimum_kernel);
REGISTER_XPU_DISPATCH(sigmoid_backward_stub, &xpu::sigmoid_backward_kernel);
REGISTER_XPU_DISPATCH(nextafter_stub, &xpu::nextafter_kernel);
REGISTER_XPU_DISPATCH(heaviside_stub, &xpu::heaviside_kernel);
REGISTER_XPU_DISPATCH(hypot_stub, &xpu::hypot_kernel);
REGISTER_XPU_DISPATCH(igamma_stub, &xpu::igamma_kernel);
REGISTER_XPU_DISPATCH(igammac_stub, &xpu::igammac_kernel);
REGISTER_XPU_DISPATCH(atan2_stub, &xpu::atan2_kernel);
REGISTER_XPU_DISPATCH(copysign_stub, &xpu::copysign_kernel);
REGISTER_XPU_DISPATCH(logical_and_stub, &xpu::logical_and_kernel);
@@ -61,6 +66,8 @@ REGISTER_XPU_DISPATCH(fmin_stub, &xpu::fmin_kernel);
REGISTER_XPU_DISPATCH(lshift_stub, &xpu::lshift_kernel);
REGISTER_XPU_DISPATCH(rshift_stub, &xpu::rshift_kernel);
REGISTER_XPU_DISPATCH(xlogy_stub, &xpu::xlogy_kernel);
REGISTER_XPU_DISPATCH(xlog1py_stub, &xpu::xlog1py_kernel);
REGISTER_XPU_DISPATCH(zeta_stub, &xpu::zeta_kernel);
REGISTER_XPU_DISPATCH(
hermite_polynomial_h_stub,
&xpu::hermite_polynomial_h_kernel);
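Each added registration exposes the corresponding binary op on XPU tensors; for instance, a sketch using two of the new stubs (assumes an XPU-enabled build, not part of this commit):

// Sketch: newly dispatched binary ops (igamma, zeta) on XPU tensors.
// Assumes an XPU-enabled ATen build; not part of this commit.
#include <ATen/ATen.h>

int main() {
  auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kXPU);
  at::Tensor a = at::rand({4}, opts) + 1.0;
  at::Tensor b = at::rand({4}, opts) + 1.0;
  at::Tensor g = at::igamma(a, b);        // igamma_stub -> xpu::igamma_kernel
  at::Tensor z = at::special_zeta(a, b);  // zeta_stub   -> xpu::zeta_kernel
  return 0;
}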
19 changes: 19 additions & 0 deletions src/ATen/native/xpu/Distributions.cpp
@@ -57,6 +57,14 @@ Tensor _s_binomial_xpu(
return ret;
}

Tensor _s_gamma_xpu(const Tensor& alpha, c10::optional<Generator> gen_) {
auto gen = get_generator_or_default<at::XPUGeneratorImpl>(
gen_, at::xpu::detail::getDefaultXPUGenerator());
Tensor ret = at::empty(alpha.sizes(), alpha.options());
xpu::launch_gamma_kernel(ret, alpha, gen);
return ret;
}

Tensor _sample_dirichlet_xpu(
const Tensor& alpha,
std::optional<Generator> generator) {
@@ -74,6 +82,17 @@ Tensor _sample_dirichlet_xpu(
return ret;
}

Tensor _standard_gamma_grad_xpu(const Tensor& self, const Tensor& output) {
Tensor ret = at::empty(self.sizes(), self.options());
TensorIterator iter = TensorIteratorConfig()
.add_output(ret)
.add_input(self)
.add_input(output)
.build();
xpu::launch_standard_gamma_grad_kernel(iter);
return ret;
}

Tensor _dirichlet_grad_xpu(
const Tensor& x,
const Tensor& alpha,
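_s_gamma_xpu and _standard_gamma_grad_xpu together back gamma sampling and its pathwise (implicit reparameterization) gradient. A rough usage sketch through the internal ATen entry points (names assumed from stock PyTorch; XPU-enabled build required, not part of this commit):

// Sketch: gamma sampling and its reparameterization gradient on XPU.
// Uses internal ATen ops; assumes an XPU-enabled build. Not part of this commit.
#include <ATen/ATen.h>

int main() {
  at::Tensor alpha = at::full({4}, 2.5,
      at::TensorOptions().dtype(at::kFloat).device(at::kXPU));
  at::Tensor sample = at::_standard_gamma(alpha);             // -> _s_gamma_xpu
  at::Tensor grad = at::_standard_gamma_grad(alpha, sample);  // -> _standard_gamma_grad_xpu
  return 0;
}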
1 change: 0 additions & 1 deletion src/ATen/native/xpu/Embedding.cpp
@@ -35,6 +35,5 @@ Tensor& embedding_renorm_xpu_(
self, indices, max_norm, norm_type);
}


} // namespace native
} // namespace at
22 changes: 22 additions & 0 deletions src/ATen/native/xpu/ForeachOpList.cpp
@@ -4,12 +4,15 @@
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_clamp_max_native.h>
#include <ATen/ops/_foreach_clamp_min_native.h>
#include <ATen/ops/_foreach_copy_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_lerp_native.h>
#include <ATen/ops/_foreach_mul_native.h>
#include <ATen/ops/_foreach_pow_native.h>
#include <ATen/ops/_foreach_sub_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachCopyKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>

@@ -65,6 +68,7 @@ namespace native {
}

FOREACH_BINARY_OP_LIST_ALPHA(add);
FOREACH_BINARY_OP_LIST_ALPHA(sub);
FOREACH_BINARY_OP_LIST(mul, false);
FOREACH_BINARY_OP_LIST(div, true);
FOREACH_BINARY_OP_LIST(clamp_max, true);
@@ -147,5 +151,23 @@ void foreach_tensor_lerp_ternary_xpu_(
}
}

void foreach_tensor_copy_list_kernel_xpu_(
TensorList self,
TensorList src,
bool non_blocking) {
check_foreach_api_restrictions(self, src);
if (!can_use_fast_route(
self, src, /* does_op_promote_integer_inputs_to_float */ false)) {
return foreach_tensor_copy_list_kernel_slow_(self, src, non_blocking);
}

xpu::foreach_copy_list_kernel_(self, src);

// increment_version
for (const auto& t : self) {
t.unsafeGetTensorImpl()->bump_version();
}
}

} // namespace native
} // namespace at
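The new list-copy entry validates the foreach API restrictions, takes the fused multi-tensor path when the fast-route check passes, and otherwise falls back to the slow per-tensor loop; bumping each destination's version afterwards keeps autograd's mutation tracking consistent. A usage sketch (XPU-enabled build assumed, not part of this commit):

// Sketch: fused multi-tensor copy through the _foreach API.
// Assumes an XPU-enabled build; not part of this commit.
#include <ATen/ATen.h>
#include <vector>

int main() {
  auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kXPU);
  std::vector<at::Tensor> dst = {at::zeros({8}, opts), at::zeros({8}, opts)};
  std::vector<at::Tensor> src = {at::ones({8}, opts), at::ones({8}, opts)};
  at::_foreach_copy_(dst, src, /*non_blocking=*/false);  // fast route if dtypes/shapes match
  return 0;
}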