Merge branch 'main' into chao/fp64emu

intel · Jul 30, 2024 · fc75a4e · fc75a4e
2 parents b59d675 + 8a821bf
commit fc75a4e
Show file tree

Hide file tree

Showing 28 changed files with 1,588 additions and 140 deletions.
diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml
@@ -2,8 +2,10 @@ name: Nightly-OnDemand Tests
 
 on:
   schedule:
-    # GMT+8 21:00 every day
-    - cron: '0 13 * * *'
+    # GMT+8 21:00 Sunday-Thursday (cron day-of-week 0-4, evaluated in UTC)
+    - cron: '0 13 * * 0-4'
+    # GMT+8 0:00 Saturday
+    - cron: '0 16 * * 5'
   workflow_dispatch:
     inputs:
       pytorch:
@@ -78,7 +80,7 @@ jobs:
     runs-on: pvc_e2e
     # Don't run on forked repos
     if: github.repository_owner == 'intel'
-    timeout-minutes: 900
+    timeout-minutes: 3600
     env:
       pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
       keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
@@ -174,8 +176,10 @@ jobs:
           echo "$GITHUB_ENV"
           rm -rf ../pytorch/inductor_log
           rm -rf /tmp/torchinductor_*
+
+      # Nightly launch
       - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test
-        if: github.event_name == 'schedule'
+        if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4'
         uses: ./.github/actions/inductor-xpu-e2e-test
         with:
           suite: huggingface
@@ -185,7 +189,7 @@ jobs:
           scenario: accuracy
           hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       - name: Nightly Torchbench BF16 Training Accuracy Test
-        if: github.event_name == 'schedule'
+        if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4'
         uses: ./.github/actions/inductor-xpu-e2e-test
         with:
           suite: torchbench
@@ -195,7 +199,7 @@ jobs:
           env_prepare: true
           hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       - name: Nightly Timm_models FP16 Training Accuracy Test
-        if: github.event_name == 'schedule'
+        if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4'
         uses: ./.github/actions/inductor-xpu-e2e-test
         with:
           suite: timm_models
@@ -204,6 +208,38 @@ jobs:
           scenario: accuracy
           env_prepare: true
           hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      # Weekly launch
+      - name: Weekly Huggingface Full Test
+        if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
+        uses: ./.github/actions/inductor-xpu-e2e-test
+        with:
+          suite: huggingface
+          env_prepare: true
+          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
+          mode: inference,training
+          scenario: accuracy,performance
+          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      - name: Weekly Torchbench Full Test
+        if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
+        uses: ./.github/actions/inductor-xpu-e2e-test
+        with:
+          suite: torchbench
+          env_prepare: true
+          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
+          mode: inference,training
+          scenario: accuracy,performance
+          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      - name: Weekly Timm_models Full Test
+        if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
+        uses: ./.github/actions/inductor-xpu-e2e-test
+        with:
+          suite: timm_models
+          env_prepare: true
+          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
+          mode: inference,training
+          scenario: accuracy,performance
+          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      # On-demand launch
       - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
         if: github.event_name != 'schedule'
         uses: ./.github/actions/inductor-xpu-e2e-test
@@ -216,7 +252,7 @@ jobs:
           hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       - name: Summarize archieve files
         id: summary
-        if: always()
+        if: ${{ ! cancelled() }}
         run: |
           rm -rf ${{ github.workspace }}/upload_files
           cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
@@ -237,14 +273,14 @@ jobs:
             exit 1
           fi
       - name: Upload Inductor XPU E2E Data
-        if: always()
+        if: ${{ ! cancelled() }}
         uses: actions/upload-artifact@v4
         with:
           name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
           path: ${{ github.workspace }}/upload_files
-      
+
   Tests-Failure-And-Report:
-    if: always()
+    if: ${{ ! cancelled() }}
     runs-on: pvc_e2e
     permissions:
       issues: write
@@ -288,6 +324,9 @@ jobs:
             test_type="On-demand"
             test_issue_id=426
             cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}"
+          elif [ "${{ github.event.schedule }}" == "0 16 * * 5" ];then
+            test_type="Weekly"
+            test_issue_id=432
           else
             test_type="Nightly"
             test_issue_id=432

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -127,7 +127,7 @@ jobs:
           env_prepare: true
           hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       - name: Summarize archieve files
-        if: always()
+        if: ${{ ! cancelled() }}
         run: |
           rm -rf ${{ github.workspace }}/upload_files
           cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
@@ -137,7 +137,7 @@ jobs:
             exit 1
           fi
       - name: Upload Inductor XPU E2E Data
-        if: always()
+        if: ${{ ! cancelled() }}
         uses: actions/upload-artifact@v4
         with:
           name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}

diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp
@@ -45,6 +45,36 @@ Tensor XPUNativeFunctions::index_select(
   return index_select_out(self, dim, index, out);
 }
 
+// In-place masked scatter for XPU. Verifies that `self` has no internal
+// overlap, that `self` and `source` share a dtype and that `mask` is boolean,
+// expands the mask against `self`, and hands the work to the XPU kernel.
+// Returns `self`.
+Tensor& XPUNativeFunctions::masked_scatter_(
+    Tensor& self,
+    const Tensor& mask,
+    const Tensor& source) {
+  at::assert_no_internal_overlap(self);
+
+  const auto self_dtype = self.scalar_type();
+  const auto source_dtype = source.scalar_type();
+  TORCH_CHECK(
+      self_dtype == source_dtype,
+      "masked_scatter_: expected self and source to have same dtypes but got ",
+      self_dtype,
+      " and ",
+      source_dtype);
+  TORCH_CHECK(
+      mask.dtype() == ScalarType::Bool,
+      "masked_scatter_ only supports boolean masks, "
+      "but got mask with dtype ",
+      mask.dtype());
+
+  c10::MaybeOwned<Tensor> expanded_mask =
+      expand_inplace(self, mask, "masked_scatter_");
+
+  // An empty destination needs no kernel launch.
+  if (self.numel() != 0) {
+    // int64 scratch tensor shaped like `self`, passed through to the kernel.
+    auto prefix_sum = at::empty(self.sizes(), mask.options().dtype(kLong));
+    native::xpu::masked_scatter_kernel(
+        self, *expanded_mask, prefix_sum, source);
+  }
+
+  return self;
+}
+
 static Tensor& masked_select_out_impl(
     Tensor& result,
     const Tensor& self,

diff --git a/src/ATen/native/xpu/Loss.cpp b/src/ATen/native/xpu/Loss.cpp
@@ -80,6 +80,64 @@ Tensor& XPUNativeFunctions::mse_loss_backward_out(
   return grad_input;
 }
 
+
+// Computes the smooth L1 loss of `input` against `target` into `result`.
+// With Reduction::None the element-wise loss is written directly to `result`.
+// Otherwise `result` is resized to a 0-dim tensor and receives the mean or the
+// sum of the element-wise loss. Returns `result`.
+Tensor& XPUNativeFunctions::smooth_l1_loss_out(
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta,
+    Tensor& result) {
+  if (reduction != Reduction::None) {
+    TORCH_INTERNAL_ASSERT(
+        reduction == Reduction::Mean || reduction == Reduction::Sum);
+    result.resize_({});
+    // Left undefined here; the element-wise loss is read back via
+    // iter.output() after the kernel has run.
+    Tensor loss;
+    auto iter = TensorIterator::borrowing_binary_op(loss, input, target);
+    native::xpu::smooth_l1_kernel(iter, beta);
+    // `result` is already a mutable reference, so no const_cast is needed.
+    if (reduction == Reduction::Mean) {
+      at::mean_out(result, iter.output(), IntArrayRef{});
+    } else {
+      at::sum_out(result, iter.output(), IntArrayRef{});
+    }
+  } else {
+    auto iter = TensorIterator::borrowing_binary_op(result, input, target);
+    native::xpu::smooth_l1_kernel(iter, beta);
+  }
+  return result;
+}
+
+// Functional smooth L1 loss: allocates an output shaped like `input` and
+// delegates to smooth_l1_loss_out, which fills (and, for reduced modes,
+// resizes) that tensor in place.
+Tensor XPUNativeFunctions::smooth_l1_loss(
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta) {
+  Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  // The out variant returns a reference to `result`, so assigning its return
+  // value back to `result` would only be a self-assignment.
+  XPUNativeFunctions::smooth_l1_loss_out(
+      input, target, reduction, beta, result);
+  return result;
+}
+
+// Backward of smooth L1 loss, written into `grad_input`. Builds a tensor
+// iterator over (grad_input; input, target, grad_output) with common-dtype
+// promotion and launches the XPU backward kernel. Returns `grad_input`.
+Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out(
+    const Tensor& grad_output,
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta,
+    Tensor& grad_input) {
+  // Mean reduction scales by 1 / numel; every other mode uses a factor of 1.
+  double norm = 1.;
+  if (reduction == Reduction::Mean) {
+    norm = 1. / input.numel();
+  }
+
+  at::TensorIteratorConfig config;
+  config.add_output(grad_input);
+  config.add_const_input(input);
+  config.add_const_input(target);
+  config.add_const_input(grad_output);
+  config.promote_inputs_to_common_dtype(true);
+  config.cast_common_dtype_to_outputs(true);
+  config.enforce_safe_casting_to_output(true);
+  auto iter = config.build();
+
+  native::xpu::smooth_l1_backward_kernel(iter, norm, beta);
+  return grad_input;
+}
+
 Tensor XPUNativeFunctions::binary_cross_entropy(
     const Tensor& self,
     const Tensor& target,

diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp
@@ -151,6 +151,21 @@ Tensor& XPUNativeFunctions::complex_out(
   return result;
 }
 
+// Out variant of polar for XPU. Runs complex_check_dtype on the operands,
+// builds a tensor iterator over (result; abs, angle) with the same-dtype
+// check disabled, and launches the XPU polar kernel. Returns `result`.
+Tensor& XPUNativeFunctions::polar_out(
+    const Tensor& abs,
+    const Tensor& angle,
+    Tensor& result) {
+  complex_check_dtype(result, abs, angle);
+
+  TensorIteratorConfig config;
+  config.add_output(result);
+  config.add_const_input(abs);
+  config.add_const_input(angle);
+  config.check_all_same_dtype(false);
+  auto iter = config.build();
+
+  native::xpu::polar_kernel(iter);
+  return result;
+}
+
 Tensor& XPUNativeFunctions::randperm_out(
     int64_t n,
     c10::optional<Generator> generator,

diff --git a/src/ATen/native/xpu/WeightNorm.cpp b/src/ATen/native/xpu/WeightNorm.cpp
@@ -0,0 +1,27 @@
+#include <ATen/native/xpu/sycl/WeightNormKernels.h>
+#include <ATen/xpu/XPUNativeFunctions.h>
+namespace at {
+// Forward weight-norm entry point: forwards `v`, `g` and `dim` unchanged to
+// the XPU kernel and returns the tensor pair it produces.
+std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface(
+    const Tensor& v,
+    const Tensor& g,
+    int64_t dim) {
+  auto outputs = native::xpu::weight_norm_kernel(v, g, dim);
+  return outputs;
+}
+
+// Backward weight-norm entry point. Requires `saved_v`, `saved_g` and
+// `saved_norms` to be contiguous and `dim` to be the first or last dimension
+// of `saved_v`, then forwards all arguments to the XPU backward kernel and
+// returns the tensor pair it produces.
+std::tuple<Tensor, Tensor> XPUNativeFunctions::_weight_norm_interface_backward(
+    const Tensor& grad_w,
+    const Tensor& saved_v,
+    const Tensor& saved_g,
+    const Tensor& saved_norms,
+    int64_t dim) {
+  TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous");
+  TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous");
+  TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous");
+  TORCH_CHECK(
+      dim == 0 || dim == saved_v.dim() - 1,
+      "fused kernels can only be applied for first or last dim");
+
+  return native::xpu::weight_norm_backward_kernel(
+      grad_w, saved_v, saved_g, saved_norms, dim);
+}
+} // namespace at
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
@@ -224,7 +224,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
     "log_normal_",
     "logspace.out",
     "lu_unpack.out",
-    "masked_scatter_",
     "max_pool3d_with_indices",
     "max_pool3d_with_indices_backward",
     "max_unpool2d",
@@ -240,7 +239,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
     "ormqr",
     "_pdist_backward",
     "_pdist_forward",
-    "polar.out",
     "_prelu_kernel",
     "_prelu_kernel_backward",
     "prod",
@@ -257,8 +255,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
     "signbit.out",
     "sign.out",
     "sinc.out",
-    "smooth_l1_loss_backward.grad_input",
-    "smooth_l1_loss.out",
     "special_airy_ai.out",
     "special_bessel_j0.out",
     "special_bessel_j1.out",

diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp
@@ -124,8 +124,8 @@ void launch_adaptive_max_pool2d_kernel(
   using KernelClass = AdaptiveMaxPool2dKernelFunctor<scalar_t, index_t>;
 
   int64_t output_size = batch * plane * osizeH * osizeW;
-  BatchKernelConfig cfg = {
-      1, output_size, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
+  BatchKernelConfig cfg = BatchKernelConfig::make_config<KernelClass>(
+      1, output_size, 1, 1, true, BatchKernelConfig::Policy::pAdaptive);
 
   cfg.build<KernelClass>();
 
@@ -301,8 +301,8 @@ void launch_adaptive_max_pool2d_backward_kernel(
     int64_t sizeP) {
   using KernelClass = AdaptiveMaxPool2dBackwardKernelFunctor<scalar_t, index_t>;
 
-  BatchKernelConfig cfg = {
-      1, osize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
+  BatchKernelConfig cfg = BatchKernelConfig::make_config<KernelClass>(
+      1, osize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive);
 
   cfg.build<KernelClass>();