coderfeli
diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu
+15-18
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
+37-51
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
+18-24
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
+19-25
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
+20-26
@@ -157,24 +157,21 @@ void _amp_foreach_non_finite_check_and_unscale_cuda_(TensorList scaled_grads,
       using opmath_t = at::opmath_type<scalar_t>;
 
       // multi_tensor_apply guards onto tensor_lists[0][0], no need to guard explicitly.
-      DISPATCH_MULTI_TENSOR_APPLY([&]() {
-        multi_tensor_apply<1>(tensor_lists,
-                              UnaryOpFunctor<scalar_t,
-                                             /* depth */ 1,
-                                             /* r_args_depth */ 1,
-                                             /* res_arg_index */ 0,
-                                             large_kernel_arg>(),
-                              [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (opmath_t val) -> opmath_t {
-                                // There is a slight asymmetry here with the TensorIterator kernel above.
-                                // MTA Functors ensure val comes in as opmath_t rather than scalar_t.
-                                if (!isfinite_ensure_cuda_math(val)) {
-                                  *found_inf_ptr = 1.f;
-                                }
-                                // Every thread accesses inv_scale, but it will hit in cache.
-                                const auto inv_scale_val = *inv_scale_ptr;
-                                return static_cast<opmath_t>(inv_scale_val == 1.f ? val : val * inv_scale_val);
-                              });
-      });
+      multi_tensor_apply<1>(tensor_lists,
+                            UnaryOpFunctor<scalar_t,
+                                           /* depth */ 1,
+                                           /* r_args_depth */ 1,
+                                           /* res_arg_index */ 0>(),
+                            [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (opmath_t val) -> opmath_t {
+                              // There is a slight asymmetry here with the TensorIterator kernel above.
+                              // MTA Functors ensure val comes in as opmath_t rather than scalar_t.
+                              if (!isfinite_ensure_cuda_math(val)) {
+                                *found_inf_ptr = 1.f;
+                              }
+                              // Every thread accesses inv_scale, but it will hit in cache.
+                              const auto inv_scale_val = *inv_scale_ptr;
+                              return static_cast<opmath_t>(inv_scale_val == 1.f ? val : val * inv_scale_val);
+                            });
     });
 }
 
 
@@ -41,18 +41,15 @@ std::vector<Tensor> foreach_tensor_list_op(
   tensor_lists.emplace_back(std::move(vec_res));
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<3>(
-        tensor_lists,
-        BinaryOpListAlphaFunctor<
-            T,
-            /* depth */ 3,
-            /* r_args_depth */ 2,
-            /* res_arg_index */ 2,
-            large_kernel_arg>(),
-        Op<opmath_t>(),
-        alpha.to<opmath_t>());
-  });
+  multi_tensor_apply<3>(
+      tensor_lists,
+      BinaryOpListAlphaFunctor<
+          T,
+          /* depth */ 3,
+          /* r_args_depth */ 2,
+          /* res_arg_index */ 2>(),
+      Op<opmath_t>(),
+      alpha.to<opmath_t>());
 
   return tensor_lists[2];
 }
@@ -67,18 +64,15 @@ void foreach_tensor_list_op_(
   tensor_lists.emplace_back(tensors2.vec());
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<2>(
-        tensor_lists,
-        BinaryOpListAlphaFunctor<
-            T,
-            /* depth */ 2,
-            /* r_args_depth */ 2,
-            /* res_arg_index */ 0,
-            large_kernel_arg>(),
-        Op<opmath_t>(),
-        alpha.to<opmath_t>());
-  });
+  multi_tensor_apply<2>(
+      tensor_lists,
+      BinaryOpListAlphaFunctor<
+          T,
+          /* depth */ 2,
+          /* r_args_depth */ 2,
+          /* res_arg_index */ 0>(),
+      Op<opmath_t>(),
+      alpha.to<opmath_t>());
   increment_version(tensors1);
 }
 
@@ -337,15 +331,13 @@ template <
     typename src_t,
     int depth,
     int r_args_depth,
-    int res_arg_index,
-    bool large_kernel_arg>
+    int res_arg_index>
 struct CopyFunctor {
-  static constexpr bool use_large_kernel_arg = large_kernel_arg;
   static_assert(depth == 2 && r_args_depth == 1 && res_arg_index == 1);
   template <typename Op>
   __device__ __forceinline__ void operator()(
       int chunk_size,
-      TensorListMetadata<depth, large_kernel_arg>& tl,
+      TensorListMetadata<depth>& tl,
       Op op) {
     const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
     const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
@@ -428,36 +420,30 @@ void foreach_tensor_copy_list_kernel_cuda_(
         using opmath_t = at::opmath_type<scalar_t>;
         AT_DISPATCH_SOURCE_TYPES(src[0].scalar_type(), "foreach_tensor_copy", [&] {
           if constexpr (std::is_same_v<scalar_t, src_t>) {
-            DISPATCH_MULTI_TENSOR_APPLY([&]() {
-              multi_tensor_apply<2>(
-                  tensor_lists,
-                  UnaryOpFunctor<
-                      scalar_t,
-                      /* depth */ 2,
-                      /* r_args_depth */ 1,
-                      /* res_arg_index */ 1,
-                      large_kernel_arg>(),
-                  Copy<opmath_t, opmath_t>());
-            });
+            multi_tensor_apply<2>(
+                tensor_lists,
+                UnaryOpFunctor<
+                    scalar_t,
+                    /* depth */ 2,
+                    /* r_args_depth */ 1,
+                    /* res_arg_index */ 1>(),
+                Copy<opmath_t, opmath_t>());
           } else {
             // Ref:
             // https://github.com/pytorch/pytorch/blob/656134c38f4737d13c3f43fc5c59470bc23c1d2f/aten/src/ATen/native/Copy.cpp#L299-L301
             if (!self[0].is_complex() && src[0].is_complex()) {
               TORCH_WARN_ONCE(
                   "Casting complex values to real discards the imaginary part");
             }
-            DISPATCH_MULTI_TENSOR_APPLY([&]() {
-              multi_tensor_apply<2>(
-                  tensor_lists,
-                  CopyFunctor<
-                      scalar_t,
-                      src_t,
-                      /* depth */ 2,
-                      /* r_args_depth */ 1,
-                      /* res_arg_index */ 1,
-                      large_kernel_arg>(),
-                  Copy<scalar_t, src_t>());
-            });
+            multi_tensor_apply<2>(
+                tensor_lists,
+                CopyFunctor<
+                    scalar_t,
+                    src_t,
+                    /* depth */ 2,
+                    /* r_args_depth */ 1,
+                    /* res_arg_index */ 1>(),
+                Copy<scalar_t, src_t>());
           }
         });
       });
 
@@ -36,18 +36,15 @@ std::vector<Tensor> foreach_binary_op(
   tensor_lists.emplace_back(std::move(vec_res));
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<2>(
-        tensor_lists,
-        BinaryOpScalarFunctor<
-            T,
-            /* depth */ 2,
-            /* r_args_depth */ 1,
-            /* res_arg_index */ 1,
-            large_kernel_arg>(),
-        Op<opmath_t>(),
-        scalar.to<opmath_t>());
-  });
+  multi_tensor_apply<2>(
+      tensor_lists,
+      BinaryOpScalarFunctor<
+          T,
+          /* depth */ 2,
+          /* r_args_depth */ 1,
+          /* res_arg_index */ 1>(),
+      Op<opmath_t>(),
+      scalar.to<opmath_t>());
   return tensor_lists[1];
 }
 
@@ -57,18 +54,15 @@ void foreach_binary_op_(TensorList tensors, const Scalar& scalar) {
   tensor_lists.emplace_back(tensors.vec());
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<1>(
-        tensor_lists,
-        BinaryOpScalarFunctor<
-            T,
-            /* depth */ 1,
-            /* r_args_depth */ 1,
-            /* res_arg_index */ 0,
-            large_kernel_arg>(),
-        Op<opmath_t>(),
-        scalar.to<opmath_t>());
-  });
+  multi_tensor_apply<1>(
+      tensor_lists,
+      BinaryOpScalarFunctor<
+          T,
+          /* depth */ 1,
+          /* r_args_depth */ 1,
+          /* res_arg_index */ 0>(),
+      Op<opmath_t>(),
+      scalar.to<opmath_t>());
   increment_version(tensors);
 }
 
 
@@ -36,19 +36,16 @@ std::vector<Tensor> foreach_binary_op(
   tensor_lists.emplace_back(vec_res);
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<2, opmath_t>(
-        tensor_lists,
-        scalars,
-        BinaryOpScalarListFunctor<
-            T,
-            /* depth */ 2,
-            /* r_args_depth */ 1,
-            /* res_arg_index */ 1,
-            large_kernel_arg>(),
-
-        Op<opmath_t>());
-  });
+  multi_tensor_apply<2, opmath_t>(
+      tensor_lists,
+      scalars,
+      BinaryOpScalarListFunctor<
+          T,
+          /* depth */ 2,
+          /* r_args_depth */ 1,
+          /* res_arg_index */ 1>(),
+
+      Op<opmath_t>());
   return tensor_lists[1];
 }
 
@@ -58,18 +55,15 @@ void foreach_binary_op_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
   tensor_lists.emplace_back(tensors.vec());
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<1, opmath_t>(
-        tensor_lists,
-        scalars,
-        BinaryOpScalarListFunctor<
-            T,
-            /* depth */ 1,
-            /* r_args_depth */ 1,
-            /* res_arg_index */ 0,
-            large_kernel_arg>(),
-        Op<opmath_t>());
-  });
+  multi_tensor_apply<1, opmath_t>(
+      tensor_lists,
+      scalars,
+      BinaryOpScalarListFunctor<
+          T,
+          /* depth */ 1,
+          /* r_args_depth */ 1,
+          /* res_arg_index */ 0>(),
+      Op<opmath_t>());
   increment_version(tensors);
 }
 
 
@@ -46,19 +46,16 @@ std::vector<Tensor> foreach_binary_op(
   tensor_lists.emplace_back(std::move(vec_res));
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<2>(
-        tensor_lists,
-        BinaryOpScalarTensorFunctor<
-            T,
-            /* depth */ 2,
-            /* r_args_depth */ 1,
-            /* res_arg_index */ 1,
-            large_kernel_arg>(),
-        Op<opmath_t>(),
-        scalar.data_ptr<T>(),
-        alpha.to<opmath_t>());
-  });
+  multi_tensor_apply<2>(
+      tensor_lists,
+      BinaryOpScalarTensorFunctor<
+          T,
+          /* depth */ 2,
+          /* r_args_depth */ 1,
+          /* res_arg_index */ 1>(),
+      Op<opmath_t>(),
+      scalar.data_ptr<T>(),
+      alpha.to<opmath_t>());
   return tensor_lists[1];
 }
 
@@ -84,19 +81,16 @@ void foreach_binary_op_(
   tensor_lists.emplace_back(tensors.vec());
 
   using opmath_t = at::opmath_type<T>;
-  DISPATCH_MULTI_TENSOR_APPLY([&]() {
-    multi_tensor_apply<1>(
-        tensor_lists,
-        BinaryOpScalarTensorFunctor<
-            T,
-            /* depth */ 1,
-            /* r_args_depth */ 1,
-            /* res_arg_index */ 0,
-            large_kernel_arg>(),
-        Op<opmath_t>(),
-        scalar.data_ptr<T>(),
-        alpha.to<opmath_t>());
-  });
+  multi_tensor_apply<1>(
+      tensor_lists,
+      BinaryOpScalarTensorFunctor<
+          T,
+          /* depth */ 1,
+          /* r_args_depth */ 1,
+          /* res_arg_index */ 0>(),
+      Op<opmath_t>(),
+      scalar.data_ptr<T>(),
+      alpha.to<opmath_t>());
   increment_version(tensors);
 }