diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml
index a539260a6..b21864e9b 100644
--- a/.github/workflows/_linux_transformers.yml
+++ b/.github/workflows/_linux_transformers.yml
@@ -51,6 +51,7 @@ jobs:
       python: ${{ inputs.python != '' && inputs.python || '3.10' }}
       pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }}
       transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }}
+      PYTORCH_DEBUG_XPU_FALLBACK: '1'
       TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
     steps:
       - name: Checkout torch-xpu-ops
@@ -269,6 +270,32 @@ jobs:
             echo "| $test_group | $file | $error | $comment |"
           done <_failures_uniq.txt
         } >> $GITHUB_STEP_SUMMARY
+    - name: Print not implemented XPU backend ops
+      run: |
+        cd transformers
+        {
+          echo "### Not implemented ops"
+          echo "| Test group | Operator | Status |"
+          echo "| --- | --- | --- |"
+          rm -rf _ops.txt && touch _ops.txt
+          for log in $(find reports -name failures_line.txt); do
+            # Each failures_line.txt is located in: reports/$test_group/failures_line.txt
+            test_group=$(echo $log | cut -f2 -d/)
+            ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/")
+            for op in $ops; do
+              echo "| $test_group | \`$op\` | not implemented |" >> _ops.txt
+            done
+          done
+          for log in $(find reports -name warnings.txt); do
+            # Each warnings.txt is located in: reports/$test_group/warnings.txt
+            test_group=$(echo $log | cut -f2 -d/)
+            ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\)' is not.*/\1/")
+            for op in $ops; do
+              echo "| $test_group | \`$op\` | fallback to CPU happens |" >> _ops.txt
+            done
+          done
+          sort _ops.txt | uniq
+        } >> $GITHUB_STEP_SUMMARY
     - name: Print annotations
       if: ${{ ! cancelled() }}
       run: |
@@ -315,6 +342,8 @@ jobs:
           # printing annotations with key environment variables
           echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |"
           echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |"
+          echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |"
+          echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |"
         } >> $GITHUB_STEP_SUMMARY
     - name: Upload Test log
       if: ${{ ! cancelled() }}
diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp
index e816d48c8..f155165ce 100644
--- a/src/ATen/native/xpu/SoftMax.cpp
+++ b/src/ATen/native/xpu/SoftMax.cpp
@@ -76,6 +76,17 @@ TORCH_IMPL_FUNC(log_softmax_xpu_out)
   xpu::_log_softmax_kernel(input, dim, half_to_float, output);
 }
 
+Tensor _safe_softmax_xpu(
+    const Tensor& self,
+    int64_t dim,
+    std::optional<ScalarType> dtype) {
+  // TODO: uncomment after XPU softmax supports half_to_float=true
+  // if (self.scalar_type() == ScalarType::Half && dtype == ScalarType::Float)
+  //   return xpu::_safe_softmax_kernel(self, dim, true);
+  Tensor converted = dtype.has_value() ? self.toType(dtype.value()) : self;
+  return xpu::_safe_softmax_kernel(converted, dim, false);
+}
+
 Tensor masked_softmax_xpu(
     const Tensor& input_,
     const Tensor& mask_,
diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
index 28d812f2c..0a0c7e718 100644
--- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
+++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
@@ -210,7 +210,8 @@ template <
     int outer_loop,
     bool is_masked,
     typename calc_t,
-    typename vec_t>
+    typename vec_t,
+    bool is_safe_softmax>
 struct DispatchSoftmaxForwardKernelFunctor
     : public __SYCL_KER_CONFIG_CONVENTION__ {
   [[intel::reqd_sub_group_size(SIMD)]] void operator()(
@@ -240,7 +241,8 @@ struct DispatchSoftmaxForwardKernelFunctor
       if (index >= dim_size_)
         break;
-      reg_in[i] = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + index));
+      reg_in[i] =
+          *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + index));
       if constexpr (is_masked) {
         auto vec_offset = group_offset + index;
 #pragma unroll(vec_size)
@@ -309,6 +311,10 @@ struct DispatchSoftmaxForwardKernelFunctor
         if constexpr (LogSoftMax) {
           reg_in[i][j] =
               static_cast<scalar_t>(reg_in[i][j] - max_value - sum_value);
+        } else if (
+            is_safe_softmax &&
+            max_value == std::numeric_limits<accscalar_t>::lowest()) {
+          reg_in[i][j] = static_cast<scalar_t>(0);
         } else if (sum_value == 0) {
           reg_in[i][j] = nan_;
         } else {
@@ -386,7 +392,8 @@ template <
     bool LogSoftMax,
     int outer_loop,
     bool is_masked = false,
-    typename calc_t = decltype(nullptr)>
+    typename calc_t = decltype(nullptr),
+    bool is_safe_softmax = false>
 bool dispatch_softmax_forward_kernel(
     const scalar_t* in_data,
     scalar_t* out_data,
@@ -412,7 +419,8 @@ bool dispatch_softmax_forward_kernel(
         outer_loop,
         is_masked,
         calc_t,
-        vec_t>;
+        vec_t,
+        /*is_safe_softmax = */ false>;
     int sub_group_num, global_size_row, local_size_row, range, local_size;
     int max_group_size =
@@ -460,8 +468,8 @@ bool dispatch_softmax_forward_kernel(
         outer_loop,
         is_masked,
         DummyFunctor,
-        vec_t>;
-
+        vec_t,
+        is_safe_softmax>;
     int sub_group_num, global_size_row, local_size_row, range, local_size;
     int max_group_size =
         get_wgroup_size(
@@ -506,7 +514,8 @@ template <
     typename IndexType,
    bool LogSoftMax,
     typename vec_t,
-    int align_bytes>
+    int align_bytes,
+    bool is_safe_softmax>
 struct SoftmaxForwardKernelFunctor {
   void operator()(sycl::nd_item<1> item) const {
     IndexType local_id = item.get_local_id(0);
@@ -562,6 +571,10 @@ struct SoftmaxForwardKernelFunctor {
       if (LogSoftMax)
         out_data_[group_offset + linear_idx] = static_cast<scalar_t>(
             in_data_[group_offset + linear_idx] - max_value - sum_value);
+      else if (
+          is_safe_softmax &&
+          max_value == std::numeric_limits<accscalar_t>::lowest())
+        out_data_[group_offset + linear_idx] = static_cast<scalar_t>(0);
       else
         out_data_[group_offset + linear_idx] = static_cast<scalar_t>(
             std::exp(in_data_[group_offset + linear_idx] - max_value) *
@@ -576,6 +589,10 @@ struct SoftmaxForwardKernelFunctor {
         if (LogSoftMax)
           in_val[j] =
               static_cast<scalar_t>(in_val[j] - max_value - sum_value);
+        else if (
+            is_safe_softmax &&
+            max_value == std::numeric_limits<accscalar_t>::lowest())
+          in_val[j] = static_cast<scalar_t>(0);
         else
           in_val[j] = static_cast<scalar_t>(
               std::exp(in_val[j] - max_value) * sum_value);
@@ -610,7 +627,8 @@ template <
     typename scalar_t,
     typename accscalar_t,
     typename IndexType,
-    bool LogSoftMax>
+    bool LogSoftMax,
+    bool is_safe_softmax>
 void softmax_forward_kernel(
     const scalar_t* in_data,
     scalar_t* out_data,
@@ -625,7 +643,8 @@ void softmax_forward_kernel(
       IndexType,
       LogSoftMax,
       vec_t,
-      align_bytes>;
+      align_bytes,
+      is_safe_softmax>;
   int local_size = std::min(
       (dim_size + vec_size - 1) / vec_size,
@@ -645,7 +664,8 @@ template <
     typename accscalar_t,
     typename IndexType,
     bool LogSoftMax,
-    typename vec_t>
+    typename vec_t,
+    bool is_safe_softmax>
 struct SpatialSoftmaxForwardKernelFunctor
     : public __SYCL_KER_CONFIG_CONVENTION__ {
   void operator()(sycl::nd_item<3> item) const {
@@ -658,14 +678,16 @@ struct SpatialSoftmaxForwardKernelFunctor
     // get max value
     accscalar_t max_value[vec_size];
     auto offset = local_row_id * inner_size_ + global_col * vec_size;
-    vec_t value = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
+    vec_t value =
+        *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
 #pragma unroll(vec_size)
     for (int j = 0; j < vec_size; ++j) {
       max_value[j] = accscalar_t(value[j]);
     }
     for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) {
       offset = i * inner_size_ + global_col * vec_size;
-      value = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
+      value =
+          *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
 #pragma unroll(vec_size)
       for (int j = 0; j < vec_size; ++j) {
         max_value[j] = std::max(max_value[j], accscalar_t(value[j]));
@@ -695,7 +717,8 @@ struct SpatialSoftmaxForwardKernelFunctor
     }
     for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) {
       offset = i * inner_size_ + global_col * vec_size;
-      value = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
+      value =
+          *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
 #pragma unroll(vec_size)
       for (int j = 0; j < vec_size; ++j) {
         sum_value[j] += std::exp(value[j] - max_value[j]);
@@ -736,6 +759,10 @@ struct SpatialSoftmaxForwardKernelFunctor
         if (LogSoftMax)
           in_val[j] =
               static_cast<scalar_t>(in_val[j] - max_value[j] - sum_value[j]);
+        else if (
+            is_safe_softmax &&
+            max_value[j] == -std::numeric_limits<accscalar_t>::infinity())
+          in_val[j] = static_cast<scalar_t>(0);
         else
           in_val[j] = static_cast<scalar_t>(
               std::exp(in_val[j] - max_value[j]) * sum_value[j]);
@@ -787,7 +814,8 @@ template <
     typename scalar_t,
     typename accscalar_t,
     typename IndexType,
-    bool LogSoftMax>
+    bool LogSoftMax,
+    bool is_safe_softmax>
 void spatial_softmax_forward(
     const scalar_t* in_data,
     scalar_t* out_data,
@@ -801,7 +829,8 @@ void spatial_softmax_forward(
       accscalar_t,
       IndexType,
       LogSoftMax,
-      vec_t>;
+      vec_t,
+      is_safe_softmax>;
   int local_size, block_row;
   get_wgroup_size_spatial(
@@ -818,7 +847,8 @@ void spatial_softmax_forward(
       accscalar_t,
       IndexType,
       LogSoftMax,
-      vec_t>(
+      vec_t,
+      is_safe_softmax>(
       in_data,
       out_data,
       dim_size,
@@ -827,7 +857,6 @@ void spatial_softmax_forward(
       local_size,
       block_row,
       group_num);
-
   auto& queue = getCurrentSYCLQueue();
   sycl_kernel_submit(global_range, local_range, queue, kfn);
 }
@@ -1387,7 +1416,11 @@ void spatial_softmax_backward_kernel(
   sycl_kernel_submit(global_range, local_range, queue, kfn);
 }
 
-template <typename scalar_t, typename accscalar_t, bool LogSoftMax>
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    bool LogSoftMax,
+    bool is_safe_softmax>
 void spatial_softmax_forward(
     const Tensor& output,
     const Tensor& input,
@@ -1432,7 +1465,10 @@
         accscalar_t, \
         uint32_t, \
         LogSoftMax, \
-        outer_loop>( \
+        outer_loop, \
+        /*is_masked = */ false, \
+        /*calc_t = */ decltype(nullptr), \
+        /*is_safe_softmax = */ is_safe_softmax>( \
         input.const_data_ptr<scalar_t>(), \
         output.mutable_data_ptr<scalar_t>(), \
         dim_size, \
@@ -1446,7 +1482,8 @@
         scalar_t, \
         accscalar_t, \
         IndexType, \
-        LogSoftMax>( \
+        LogSoftMax, \
+        is_safe_softmax>( \
         input.const_data_ptr<scalar_t>(), \
         output.mutable_data_ptr<scalar_t>(), \
         dim_size, \
@@ -1460,7 +1497,8 @@
         scalar_t, \
         accscalar_t, \
         IndexType, \
-        LogSoftMax>( \
+        LogSoftMax, \
+        is_safe_softmax>( \
         input.const_data_ptr<scalar_t>(), \
         output.mutable_data_ptr<scalar_t>(), \
         dim_size, \
@@ -1749,7 +1787,8 @@ Tensor& masked_softmax_forward(
             LogSoftMax, \
             outer_loop, \
             true, \
-            decltype(input_calc)>( \
+            decltype(input_calc), \
+            /*is_safe_softmax = */ false>( \
             input.const_data_ptr<scalar_t>(), \
             output.mutable_data_ptr<scalar_t>(), \
             dim_size, \
@@ -1922,7 +1961,7 @@
 #undef SIMD32
 } // namespace impl
 
-template <bool LogSoftMax>
+template <bool LogSoftMax, bool is_safe_softmax = false>
 void host_softmax(
     const Tensor& input_,
     const int64_t dim_,
@@ -1953,8 +1992,11 @@ void host_softmax(
       "host_softmax",
       [&] {
         using accscalar_t = acc_type_device<scalar_t, kXPU>;
-        impl::spatial_softmax_forward<scalar_t, accscalar_t, LogSoftMax>(
-            output, input, dim);
+        impl::spatial_softmax_forward<
+            scalar_t,
+            accscalar_t,
+            LogSoftMax,
+            is_safe_softmax>(output, input, dim);
       });
 }
 // return output;
@@ -2045,6 +2087,29 @@ void _log_softmax_backward_kernel(
       grad.contiguous(), output.contiguous(), dim, half_to_float, grad_input);
 }
 
+Tensor _safe_softmax_kernel(
+    const Tensor& self,
+    int64_t dim,
+    const bool half_to_float) {
+  auto output_options =
+      self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  if (half_to_float) {
+    output_options = output_options.dtype(ScalarType::Float);
+  }
+  Tensor output = at::empty_like(self, output_options);
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      ScalarType::Half,
+      ScalarType::BFloat16,
+      self.scalar_type(),
+      "_safe_softmax",
+      [&] {
+        host_softmax<false, /*is_safe_softmax = */ true>(
+            self.contiguous(), dim, half_to_float, output);
+      });
+
+  return output;
+}
+
 Tensor masked_softmax_kernel(
     const Tensor& input_,
     const Tensor& mask_,
diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.h b/src/ATen/native/xpu/sycl/SoftMaxKernels.h
index 0fc08496b..fc26fec3e 100644
--- a/src/ATen/native/xpu/sycl/SoftMaxKernels.h
+++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.h
@@ -32,6 +32,9 @@ TORCH_XPU_API void _log_softmax_backward_kernel(
     bool half_to_float,
     const Tensor& grad_input);
 
+TORCH_XPU_API Tensor
+_safe_softmax_kernel(const Tensor& self, int64_t dim, const bool half_to_float);
+
 TORCH_XPU_API Tensor masked_softmax_kernel(
     const Tensor& input_,
     const Tensor& mask_,
diff --git a/test/regressions/test_safe_softmax.py b/test/regressions/test_safe_softmax.py
new file mode 100644
index 000000000..7b390080a
--- /dev/null
+++ b/test/regressions/test_safe_softmax.py
@@ -0,0 +1,44 @@
+import torch
+from torch.testing._internal.common_utils import TestCase
+
+cpu_device = torch.device("cpu")
+xpu_device = torch.device("xpu")
+
+
+class TestSafeSoftMax(TestCase):
+    def test_sm(self):
+        for dtype in [torch.float, torch.float16, torch.bfloat16]:
+            x_cpu = torch.randn(128, 128, 128).to(dtype)
+            x_xpu = x_cpu.to(xpu_device)
+            r_cpu = torch.ops.aten._safe_softmax(x_cpu, -1)
+            r_xpu = torch.ops.aten._safe_softmax(x_xpu, -1)
+            self.assertEqual(r_xpu.to(cpu_device), r_cpu)
+            x_cpu[0, 0, :] = -float("inf")
+            x_xpu = x_cpu.to(xpu_device)
+            r_cpu = torch.ops.aten._safe_softmax(x_cpu, -1)
+            r_xpu = torch.ops.aten._safe_softmax(x_xpu, -1)
+            self.assertEqual(r_xpu.to(cpu_device), r_cpu)
+
+            x_cpu = torch.randn(128, 128, 128).to(dtype)
+            x_xpu = x_cpu.to(xpu_device)
+            r_cpu = torch.ops.aten._safe_softmax(x_cpu, 1)
+            r_xpu = torch.ops.aten._safe_softmax(x_xpu, 1)
+            self.assertEqual(r_xpu.to(cpu_device), r_cpu)
+            x_cpu[0, :, 0] = -float("inf")
+            x_xpu = x_cpu.to(xpu_device)
+            r_cpu = torch.ops.aten._safe_softmax(x_cpu, 1)
+            r_xpu = torch.ops.aten._safe_softmax(x_xpu, 1)
+            self.assertEqual(r_xpu.to(cpu_device), r_cpu)
+
+            x_cpu = torch.randn(128, 128, 128).to(dtype)
+            x_xpu = x_cpu.to(xpu_device)
+            r_cpu = torch.ops.aten._safe_softmax(x_cpu, 0)
+            r_xpu = torch.ops.aten._safe_softmax(x_xpu, 0)
+            self.assertEqual(r_xpu.to(cpu_device), r_cpu)
+            x_cpu[:, 0, 0] = -float("inf")
+            x_xpu = x_cpu.to(xpu_device)
+            r_cpu = torch.ops.aten._safe_softmax(x_cpu, 0)
+            r_xpu = torch.ops.aten._safe_softmax(x_xpu, 0)
+            self.assertEqual(r_xpu.to(cpu_device), r_cpu)
+
+
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index 52a93d91b..86f725744 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -2,6 +2,21 @@
     "test_ops_xpu.py": (
         # Skip list of base line
+        # XPU implementation doesn't claim FP8 support yet
+        # https://github.com/intel/torch-xpu-ops/issues/461
+        "float8",
+
+        # workarounds for the following tests
+        # https://github.com/intel/torch-xpu-ops/issues/1214
+        "test_python_ref__refs_exp_xpu_complex128",
+        "test_python_ref__refs_sigmoid_xpu_complex128",
+        "test_python_ref_executor__refs_log2_executor_aten_xpu_complex128",
+        "test_python_ref_executor__refs_exp_executor_aten_xpu_complex128",
+        "test_python_ref_torch_fallback__refs_log2_xpu_complex128",
+        "test_python_ref_torch_fallback__refs_log10_xpu_complex128",
+        "test_python_ref_torch_fallback__refs_sigmoid_xpu_complex128",
+        "test_python_ref_executor__refs_log10_executor_aten_xpu_complex128",
+
         # To be removed from this file.
         # CUDA and XPU both XFAIL now.
         "test_out_narrow_copy_xpu_float32",
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index cbd57c762..7a257f0fd 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -2017,6 +2017,10 @@
   dispatch:
     XPU: softmax_xpu_out
 
+- func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+  dispatch:
+    XPU: _safe_softmax_xpu
+
 - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
   structured_delegate: _softmax_backward_data.out
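
Note for reviewers: `_safe_softmax` differs from a plain softmax only on slices that are entirely `-inf` (fully masked rows), where it returns zeros instead of NaN; that is what the new `max_value == lowest()/-infinity` branches in the kernels above implement and what the regression test checks. A minimal sketch of the expected behaviour, assuming a PyTorch build that already exposes `aten::_safe_softmax` (device-agnostic; moving `x` to `"xpu"` would exercise the new kernel path):

    import torch

    x = torch.randn(4, 8)
    x[0, :] = -float("inf")  # a fully masked row

    plain = torch.softmax(x, dim=-1)
    safe = torch.ops.aten._safe_softmax(x, -1)

    print(plain[0])  # all NaN: -inf - max(-inf) is undefined before exp()
    print(safe[0])   # all zeros: the "safe" branch added by this change
    print(torch.allclose(safe[1:], plain[1:]))  # unmasked rows are unchanged

This mirrors how the scaled-dot-product-attention math path uses `_safe_softmax` to keep fully masked attention rows from propagating NaN.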