Skip to content

Commit

Permalink
Merge branch 'main' into jianyi/safe_softmax
Browse files Browse the repository at this point in the history
  • Loading branch information
xytintel authored Dec 23, 2024
2 parents a3cee6d + 7137aeb commit 4ec9145
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .github/scripts/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ if [ "$1" != "nightly_wheel" ];then
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source /opt/intel/oneapi/umf/latest/env/vars.sh
source /opt/intel/oneapi/pti/latest/env/vars.sh
source /opt/intel/oneapi/ccl/latest/env/vars.sh
source /opt/intel/oneapi/mpi/latest/env/vars.sh
else
echo "Don't need to source DL-Essential for nightly wheel"
fi
50 changes: 47 additions & 3 deletions .github/workflows/_linux_transformers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ jobs:
DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
python: ${{ inputs.python != '' && inputs.python || '3.10' }}
pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }}
transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }}
TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
steps:
- name: Checkout torch-xpu-ops
Expand All @@ -60,7 +61,7 @@ jobs:
uses: actions/checkout@v4
with:
repository: huggingface/transformers
ref: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }}
ref: ${{ env.transformers }}
path: transformers
- name: Prepare OS environment
run: |
Expand Down Expand Up @@ -106,12 +107,54 @@ jobs:
id: installed
run: |
source activate huggingface_transformers_test
echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "pip installed packages:"
pip list | tee ${{ github.workspace }}/transformers/tests_log/pip_list.txt
echo "lspci gpu devices:"
lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt
echo "GPU render nodes:"
cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt
# printing annotations for the key packages
echo "### Annotations" >> $GITHUB_STEP_SUMMARY
echo "| | |" >> $GITHUB_STEP_SUMMARY
echo "| --- | --- |" >> $GITHUB_STEP_SUMMARY
echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" >> $GITHUB_STEP_SUMMARY
echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" >> $GITHUB_STEP_SUMMARY
echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" >> $GITHUB_STEP_SUMMARY
packages=" \
level-zero \
libigc1 \
libigc2 \
libze1 \
libze-intel-gpu1 \
intel-i915-dkms \
intel-level-zero-gpu \
intel-opencl-icd"
for package in $packages; do
package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ")
echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY
done
packages="accelerate \
numpy \
torch \
torchaudio \
torchvision \
transformers"
for package in $packages; do
package_version=$(python -c "import $package; print($package.__version__)")
echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY
done
# printing annotations for GPU cards
var="[$(cat /sys/class/drm/render*/device/vendor)]"
echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY
var="[$(cat /sys/class/drm/render*/device/device)]"
echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY
var=$(python -c "import torch; print(torch.version.xpu)")
echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" >> $GITHUB_STEP_SUMMARY
var=$(python -c "import torch; print(torch.xpu.device_count())")
echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" >> $GITHUB_STEP_SUMMARY
# printing annotations with key environment variables
echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |" >> $GITHUB_STEP_SUMMARY
echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" >> $GITHUB_STEP_SUMMARY
- name: Sanitry check installed packages
run: |
source activate huggingface_transformers_test
Expand All @@ -120,6 +163,7 @@ jobs:
pip show torch | grep Version | grep xpu
pip show torchaudio | grep Version | grep xpu
pip show torchvision | grep Version | grep xpu
python -c 'import torch; exit(not torch.xpu.is_available())'
- name: Run XPU backbone
run: |
source activate huggingface_transformers_test
Expand Down
3 changes: 2 additions & 1 deletion src/ATen/native/xpu/sycl/ResizeKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) {
c10::xpu::XPUGuard guard(device.index());
at::DataPtr data = allocator->allocate(size_bytes);
if (storage->data_ptr()) {
auto q = at::xpu::getCurrentSYCLQueue();
at::globalContext().lazyInitDevice(c10::DeviceType::XPU);

auto q = at::xpu::getCurrentSYCLQueue();
q.memcpy(
data.get(), storage->data(), std::min(storage->nbytes(), size_bytes));
}
Expand Down
1 change: 1 addition & 0 deletions src/ATen/xpu/EmptyTensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ TensorBase empty_strided_xpu(
IntArrayRef stride,
ScalarType dtype,
c10::optional<Device> device_opt) {
at::globalContext().lazyInitDevice(c10::DeviceType::XPU);
const auto device = device_or_default(device_opt);
TORCH_INTERNAL_ASSERT(device.is_xpu());
const c10::DeviceGuard device_guard(device);
Expand Down
16 changes: 16 additions & 0 deletions test/xpu/extended/skip_list_arc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,21 @@
"test_compare_cpu_bincount_xpu_int64",
"test_compare_cpu_bincount_xpu_int8",
"test_compare_cpu_bincount_xpu_uint8",
# RuntimeError: Kernel is incompatible with all devices in devs
# https://github.com/intel/torch-xpu-ops/issues/1150
"test_compare_cpu_logcumsumexp_xpu_float16",
"test_compare_cpu_logcumsumexp_xpu_float32",
"test_compare_cpu_nn_functional_pdist_xpu_float32",
"test_compare_cpu_tril_indices_xpu_int32",
"test_compare_cpu_tril_indices_xpu_int64",
"test_compare_cpu_triu_indices_xpu_int32",
"test_compare_cpu_triu_indices_xpu_int64",
"test_backward_logcumsumexp_xpu_float32",
"test_backward_nn_functional_pdist_xpu_float32",
"test_forward_ad_logcumsumexp_xpu_float32",
"test_operator_logcumsumexp_xpu_float32",
"test_operator_nn_functional_pdist_xpu_float32",
"test_view_replay_logcumsumexp_xpu_float32",
"test_view_replay_nn_functional_pdist_xpu_float32",
),
}
4 changes: 4 additions & 0 deletions test/xpu/extended/skip_list_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,5 +194,9 @@
# Greatest absolute difference: 0.0625 at index (1,) (up to 0.001 allowed)
# Greatest relative difference: 0.00640869140625 at index (1,) (up to 0.001 allowed)
"test_compare_cpu_xlogy_xpu_bfloat16",
"test_compare_cpu_div_trunc_rounding_xpu_float64",
"test_compare_cpu_div_trunc_rounding_xpu_float16",
"test_compare_cpu_div_floor_rounding_xpu_float16",
"test_compare_cpu_div_floor_rounding_xpu_bfloat16",
),
}
16 changes: 15 additions & 1 deletion test/xpu/skip_list_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,14 @@
"test_python_ref__refs_square_xpu_complex64",
"test_python_ref_torch_fallback__refs_square_xpu_complex64",
"test_python_ref_torch_fallback__refs_exp_xpu_complex128",

# Failed on rolling driver, passed on preci
"test_python_ref__refs_div_trunc_rounding_xpu_float64",
"test_python_ref_executor__refs_div_trunc_rounding_executor_aten_xpu_float64",
"test_python_ref_torch_fallback__refs_div_trunc_rounding_xpu_float64",

# TODO: passes when built from source; investigate the failure on other builds
"test_python_ref__refs_log2_xpu_complex128",
),

"test_binary_ufuncs_xpu.py": (
Expand Down Expand Up @@ -1136,6 +1144,7 @@
# Greatest relative difference: 1.9145216356264427e-05 at index (463, 204) (up to 1.3e-06 allowed)
"test_reference_numerics_normal__refs_asinh_xpu_complex64",
"test_reference_numerics_normal_asinh_xpu_complex64",
"test_batch_vs_slicing__refs_sigmoid_xpu_complex128",
# Unexpected success: CUDA uses thrust::sqrt and has an accuracy issue. XPU uses std::sqrt and has no issue.
"test_reference_numerics_large_rsqrt_xpu_complex32",
# Numeric difference
Expand Down Expand Up @@ -1514,6 +1523,8 @@
# XPU does not support tunable.
"test_bmm_tunableop_rocm_xpu_float32",
"test_numeric_check_leak_tunableop_rocm_xpu_float32",
"test_dump_results_on_exit_tunableop_xpu_float32",
"test_rotating_buffer_tunableop_xpu_float32",
# CUDA bias cases added in latest PyTorch
# AttributeError: module 'torch._C' has no attribute '_cuda_tunableop_enable'
"test_matmul_check_entries_tunableop_xpu_float16",
Expand Down Expand Up @@ -3230,7 +3241,10 @@

"test_type_promotion_xpu.py": None,

"test_distributions_xpu.py": None,
"test_distributions_xpu.py": (
# TODO: Passed on lts driver version, but failed on rolling driver version
"test_gamma_gpu_sample_xpu",
),

"test_optim_xpu.py": (
# oneDNN issues
Expand Down
35 changes: 34 additions & 1 deletion test/xpu/test_unary_ufuncs_xpu.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyXPU
from torch.testing._internal.common_utils import run_tests

try:
Expand All @@ -11,6 +12,38 @@
with XPUPatchForImport(False):
from test_unary_ufuncs import TestUnaryUfuncs

@onlyXPU
def _nonzero_static_large(self, device):
# large enough to have multiple iters per SM even on H100
# with 132 sms
size_inp = 1024 * 16 * 132 + 1024 * 16
x = torch.zeros(size_inp, device=device)
# unique indices
indices = torch.randperm(size_inp, device=device)[: size_inp // 2]
sorted, _ = torch.sort(indices)
x[sorted] = 1
res = torch.nonzero_static(x, size=size_inp // 2).view(-1)
self.assertEqual(res, sorted)
# no oob writes
out = torch.full((size_inp,), 10, device=device, dtype=torch.int64)
res = torch.nonzero_static(x, size=size_inp // 4, out=out[: size_inp // 2])
self.assertEqual(out[: size_inp // 4], sorted[: size_inp // 4])
self.assertEqual(
out[size_inp // 4 :],
torch.tensor(10, device="xpu").expand_as(out[size_inp // 4 :]),
)
# correct fill for 2d
x = x.view(2, size_inp // 2)
ref = x.nonzero()
res = x.nonzero_static(size=size_inp // 2 + 2)
self.assertEqual(res.shape, [size_inp // 2 + 2, 2])
self.assertEqual(ref, res[: size_inp // 2])
self.assertEqual(
res[size_inp // 2 :],
torch.tensor(-1, device="xpu").expand_as(res[size_inp // 2 :]),
)
# Attach the XPU-only test defined above to the imported TestUnaryUfuncs class
# before instantiate_device_type_tests is called on that class.
TestUnaryUfuncs.test_nonzero_static_large = _nonzero_static_large

# NOTE(review): ("xpu") is a parenthesized string, not a one-element tuple;
# ("xpu",) may have been intended -- confirm what only_for accepts.
instantiate_device_type_tests(TestUnaryUfuncs, globals(),only_for=("xpu"), allow_xpu=True)

if __name__ == "__main__":
Expand Down

0 comments on commit 4ec9145

Please sign in to comment.