
Commit a044479

Add necessary Concat-bn-relu fusion checks to runtime kernel (#546)
* Add fusion checks back to runtime kernel
* Restore xu's dyn patch for concat-bn-relu fusion
* Fix typo
* Add warmup runs before UT checks in test_jit.py
Parent: d9ef0bc

File tree: 7 files changed, +136 -108 lines changed

intel_extension_for_pytorch/csrc/aten/cpu/kernels/jit_kernels/ConcatBnReluKrnl.cpp

+36 -23

@@ -16,8 +16,9 @@ namespace {
 
 at::Tensor concat_bn_relu_kernel_impl(
     const c10::List<at::Tensor>& a,
+    const at::Tensor& bn_scale,
     const at::Tensor& bn_beta,
-    const c10::optional<at::Tensor>& bn_scale,
+    const c10::optional<at::Tensor>& bn_weight,
     const c10::optional<at::Tensor>& bn_bias,
     const c10::optional<at::Tensor>& bn_mean,
     const c10::optional<at::Tensor>& bn_var,
@@ -27,36 +28,49 @@ at::Tensor concat_bn_relu_kernel_impl(
     bool bn_cudnn_enabled,
     int dim) {
   int64_t list_length = a.size();
+  std::vector<int64_t> output_dim = a[0].sizes().vec();
+  int64_t tensor_length = a[0].ndimension();
 
-  c10::MaybeOwned<at::Tensor> weight_maybe_owned =
-      at::borrow_from_optional_tensor(bn_scale);
-  const at::Tensor& bn_weight = *weight_maybe_owned;
-  std::vector<long int> output_dim(a[0].ndimension());
-  for (int64_t i = 0; i < list_length; ++i) {
-    output_dim[1] += a[i].size(1);
-  }
-  for (int64_t i = 0; i < a[0].ndimension(); ++i) {
-    if (i != 1) {
-      output_dim[i] = a[0].size(i);
+  // Check if the memory format is channelslast(3d) and if the channel size can
+  // be divided by 16
+  auto check_format_channelsize = [](at::Tensor tensor) {
+    return (
+        (tensor.suggest_memory_format() == at::MemoryFormat::ChannelsLast ||
+         tensor.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d) &&
+        tensor.size(1) % 16 == 0);
+  };
+
+  // Check the first tensor
+  bool tensor_check = check_format_channelsize(a[0]);
+  // Check the rest input tensors
+  for (int64_t i = 1; i < list_length; ++i) {
+    tensor_check = (tensor_check && check_format_channelsize(a[i]));
+    for (int64_t j = 0; j < tensor_length; ++j) {
+      if (j == 1) {
+        output_dim[1] += a[i].size(j);
+      } else {
+        tensor_check = (tensor_check && a[i].size(j) == a[0].size(j));
+      }
     }
   }
-  at::Tensor output = at::empty(
-      output_dim,
-      a[0].options()
-          .dtype(at::kFloat)
-          .memory_format(a[0].suggest_memory_format()));
-
 #if defined(CPU_CAPABILITY_AVX512)
-  torch_ipex::cpu::kernel::vec::vec512::ConcatBnReluKernelImpl_ChannelsLast<
-      float>(a, bn_weight, bn_beta, output);
-  return output;
-#else
+  if (tensor_check) {
+    at::Tensor output = at::empty(
+        output_dim,
+        a[0].options()
+            .dtype(at::kFloat)
+            .memory_format(a[0].suggest_memory_format()));
+    torch_ipex::cpu::kernel::vec::vec512::ConcatBnReluKernelImpl_ChannelsLast<
+        float>(a, bn_scale, bn_beta, output);
+    return output;
+  }
+#endif
   std::vector<at::Tensor> concat_input(list_length);
   for (int64_t i = 0; i < list_length; ++i)
     concat_input[i] = a[i];
   auto bn_res = at::batch_norm(
       at::cat(concat_input, (int64_t)dim),
-      bn_scale,
+      bn_weight,
       bn_bias,
       bn_mean,
       bn_var,
@@ -65,7 +79,6 @@ at::Tensor concat_bn_relu_kernel_impl(
       bn_eps,
       bn_cudnn_enabled);
   return at::relu(bn_res);
-#endif
 }
 
 #if defined(DYN_DISP_BUILD)
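Note: the eligibility check added above can be summarized as follows. This is a minimal Python sketch, not code from the commit, and it uses is_contiguous() as a stand-in for the C++ suggest_memory_format() test: every input must be channels-last (2d or 3d), have a channel count divisible by 16, and match the first input on every non-channel dimension; otherwise concat_bn_relu_kernel_impl falls back to the plain at::cat + at::batch_norm + at::relu path.

import torch

def concat_bn_relu_fusable(tensors):
    # Hypothetical restatement of the runtime check added in this commit.
    ref = tensors[0]
    for t in tensors:
        if t.dim() == 4:
            fmt = torch.channels_last
        elif t.dim() == 5:
            fmt = torch.channels_last_3d
        else:
            return False
        # channels-last layout and channel count divisible by 16
        if not t.is_contiguous(memory_format=fmt) or t.size(1) % 16 != 0:
            return False
        # all non-channel dimensions must match the first input
        if any(t.size(d) != ref.size(d) for d in range(ref.dim()) if d != 1):
            return False
    return True

a = [torch.randn(1, 32, 13, 24).to(memory_format=torch.channels_last) for _ in range(3)]
b = torch.randn(1, 17, 13, 24).to(memory_format=torch.channels_last)
print(concat_bn_relu_fusable(a))        # True
print(concat_bn_relu_fusable(a + [b]))  # False: 17 channels is not a multiple of 16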

intel_extension_for_pytorch/csrc/cpu/vec512/concat_bn_relu.h

+1 -1

@@ -64,7 +64,7 @@ void ConcatBnReluKernelImpl_ChannelsLast(
 
   for (int64_t i = 0; i < list_length; ++i) {
     input_channels[i + 1] = input_channels[i] + a[i].size(1);
-    input_ptr[i] = a[i].data_ptr<T>();
+    input_ptr[i] = a[i].contiguous(a[i].suggest_memory_format()).data_ptr<T>();
   }
   // Return the product of all the input dimensions except for the channel
   // and check if the dimension and sizes of the tensors meet the fusion
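For context on the one-line change above: the vec512 kernel walks raw data_ptr()s assuming a dense channels-last layout, but a tensor whose suggested memory format is channels-last is not necessarily densely packed (e.g. a sliced view). Calling contiguous() in the suggested format first guarantees the layout the vectorized loads expect. A small illustrative PyTorch snippet (not from the commit):

import torch

x = torch.randn(1, 32, 13, 24)                # dense NCHW tensor
y = x.to(memory_format=torch.channels_last)   # dense channels-last strides
z = y[:, :, :, :12]                           # still "looks" channels-last, but not dense
print(y.is_contiguous(memory_format=torch.channels_last))  # True
print(z.is_contiguous(memory_format=torch.channels_last))  # False
# contiguous() in the suggested format restores the dense layout before
# the kernel takes the raw pointer:
w = z.contiguous(memory_format=torch.channels_last)
print(w.is_contiguous(memory_format=torch.channels_last))  # True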

intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConcatBnRelu.cpp

+7 -4

@@ -23,8 +23,9 @@ DEFINE_DISPATCH(concat_bn_relu_kernel_stub);
 **/
 at::Tensor ConcatBnRelu(
     const c10::List<at::Tensor>& a,
+    const at::Tensor& bn_scale,
     const at::Tensor& bn_beta,
-    const c10::optional<at::Tensor>& bn_scale,
+    const c10::optional<at::Tensor>& bn_weight,
     const c10::optional<at::Tensor>& bn_bias,
     const c10::optional<at::Tensor>& bn_mean,
     const c10::optional<at::Tensor>& bn_var,
@@ -33,14 +34,15 @@ at::Tensor ConcatBnRelu(
     double bn_eps,
     bool bn_cudnn_enabled,
     int dim) {
-  IPEX_RECORD_FUNCTION("ConcatBnRelu", std::vector<c10::IValue>({}));
+  IPEX_RECORD_FUNCTION("ipex::concat_bn_relu", std::vector<c10::IValue>({}));
 
 #if defined(DYN_DISP_BUILD)
   return concat_bn_relu_kernel_stub(
       kCPU,
       a,
-      bn_beta,
       bn_scale,
+      bn_beta,
+      bn_weight,
       bn_bias,
       bn_mean,
       bn_var,
@@ -52,8 +54,9 @@ at::Tensor ConcatBnRelu(
 #else
   return concat_bn_relu_kernel_impl(
       a,
-      bn_beta,
       bn_scale,
+      bn_beta,
+      bn_weight,
      bn_bias,
      bn_mean,
      bn_var,

intel_extension_for_pytorch/csrc/jit/cpu/kernels/ConcatBnRelu.h

+5 -2

@@ -15,8 +15,9 @@ namespace cpu {
 * */
 at::Tensor ConcatBnRelu(
     const c10::List<at::Tensor>& a,
+    const at::Tensor& bn_scale,
     const at::Tensor& bn_beta,
-    const c10::optional<at::Tensor>& bn_scale,
+    const c10::optional<at::Tensor>& bn_weight,
     const c10::optional<at::Tensor>& bn_bias,
     const c10::optional<at::Tensor>& bn_mean,
     const c10::optional<at::Tensor>& bn_var,
@@ -32,8 +33,9 @@ namespace {
 
 at::Tensor concat_bn_relu_kernel_impl(
     const c10::List<at::Tensor>& a,
+    const at::Tensor& bn_scale,
     const at::Tensor& bn_beta,
-    const c10::optional<at::Tensor>& bn_scale,
+    const c10::optional<at::Tensor>& bn_weight,
     const c10::optional<at::Tensor>& bn_bias,
     const c10::optional<at::Tensor>& bn_mean,
     const c10::optional<at::Tensor>& bn_var,
@@ -50,6 +52,7 @@ at::Tensor concat_bn_relu_kernel_impl(
 using concat_bn_relu_kernel_fn = at::Tensor (*)(
     const c10::List<at::Tensor>&,
     const at::Tensor&,
+    const at::Tensor&,
     const c10::optional<at::Tensor>&,
     const c10::optional<at::Tensor>&,
     const c10::optional<at::Tensor>&,

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite.cpp

+4 -4

@@ -444,10 +444,10 @@ void FuseConcatBnRelu(std::shared_ptr<Graph>& graph) {
         %alpha: int = prim::Constant[value=1]()
         %u1 = aten::add(%running_var, %eps, %alpha)
         %u2 = aten::sqrt(%u1)
-        %u3 = aten::div(%running_mean, %u2)
-        %u4 = aten::mul(%weight, %u3)
-        %beta = aten::sub(%bias, %u4, %alpha)
-        %b = ipex::concat_bn_relu(%input, %beta, %weight, %bias, %running_mean, %running_var, %training, %momentum, %eps, %cudnn_enabled, %dim)
+        %scale = aten::div(%weight, %u2)
+        %u3 = aten::mul(%running_mean, %scale)
+        %beta = aten::sub(%bias, %u3, %alpha)
+        %b = ipex::concat_bn_relu(%input, %scale, %beta, %weight, %bias, %running_mean, %running_var, %training, %momentum, %eps, %cudnn_enabled, %dim)
         return (%b) )";
 
  auto fusion_filter = [](const Match& match,
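The rewritten pattern folds the inference-mode BatchNorm parameters into a per-channel scale and shift before calling the fused op: scale = weight / sqrt(running_var + eps) and beta = bias - running_mean * scale, so the fused kernel only has to compute relu(cat(inputs) * scale + beta). A plain-PyTorch sketch of that algebra (names are illustrative, not the IPEX implementation):

import torch

def fold_bn(weight, bias, running_mean, running_var, eps):
    # Per-channel constants corresponding to %scale and %beta in the pattern above.
    scale = weight / torch.sqrt(running_var + eps)
    beta = bias - running_mean * scale
    return scale, beta

def concat_bn_relu_reference(inputs, scale, beta, dim=1):
    # relu(cat(inputs) * scale + beta), broadcast over the channel dimension.
    x = torch.cat(list(inputs), dim=dim)
    shape = [1] * x.dim()
    shape[dim] = -1
    return torch.relu(x * scale.reshape(shape) + beta.reshape(shape))

# Quick numerical check against eager BatchNorm + ReLU in eval mode:
bn = torch.nn.BatchNorm2d(96).eval()
xs = [torch.randn(1, 32, 13, 24) for _ in range(3)]
with torch.no_grad():
    scale, beta = fold_bn(bn.weight, bn.bias, bn.running_mean, bn.running_var, bn.eps)
    expected = torch.relu(bn(torch.cat(xs, dim=1)))
    print(torch.allclose(concat_bn_relu_reference(xs, scale, beta), expected, atol=1e-6))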

intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp

+14 -13

@@ -675,24 +675,25 @@ RegisterOperators op({
         },
         aliasAnalysisFromSchema()),
     Operator(
-        "ipex::concat_bn_relu(Tensor[] a, Tensor bn_beta, "
+        "ipex::concat_bn_relu(Tensor[] a, Tensor bn_scale, Tensor bn_beta, "
         "Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled, int dim) -> "
         "Tensor",
         [](const Node* node) -> Operation {
          return [](Stack* stack) {
            auto result = ConcatBnRelu(
-                (std::move(peek(stack, 0, 11))).toTensorList(),
-                (std::move(peek(stack, 1, 11))).toTensor(),
-                toOptionalTensor(std::move(peek(stack, 2, 11))),
-                toOptionalTensor(std::move(peek(stack, 3, 11))),
-                toOptionalTensor(std::move(peek(stack, 4, 11))),
-                toOptionalTensor(std::move(peek(stack, 5, 11))),
-                (std::move(peek(stack, 6, 11))).toBool(),
-                (std::move(peek(stack, 7, 11))).toDouble(),
-                (std::move(peek(stack, 8, 11))).toDouble(),
-                (std::move(peek(stack, 9, 11))).toBool(),
-                (std::move(peek(stack, 10, 11))).toInt());
-            drop(stack, 11);
+                (std::move(peek(stack, 0, 12))).toTensorList(),
+                (std::move(peek(stack, 1, 12))).toTensor(),
+                (std::move(peek(stack, 2, 12))).toTensor(),
+                toOptionalTensor(std::move(peek(stack, 3, 12))),
+                toOptionalTensor(std::move(peek(stack, 4, 12))),
+                toOptionalTensor(std::move(peek(stack, 5, 12))),
+                toOptionalTensor(std::move(peek(stack, 6, 12))),
+                (std::move(peek(stack, 7, 12))).toBool(),
+                (std::move(peek(stack, 8, 12))).toDouble(),
+                (std::move(peek(stack, 9, 12))).toDouble(),
+                (std::move(peek(stack, 10, 12))).toBool(),
+                (std::move(peek(stack, 11, 12))).toInt());
+            drop(stack, 12);
            pack(stack, std::move(result));
            return 0;
          };

tests/cpu/test_jit.py

+69 -61

@@ -961,46 +961,6 @@ def test_add_layernorm(self):
         node = "ipex::add_layernorm"
         self.assertTrue(any(n.kind() == node for n in trace_graph.nodes()))
 
-    def _test_concat_bn_relu(self, a1, a2, a3, enable_3d=True, use_channels_last=True):
-        if enable_3d:
-            if use_channels_last:
-                model = ConcatBnRelu3d().eval().to(memory_format=torch.channels_last_3d)
-                model = ipex.optimize(model, dtype=torch.float32, level='O0')
-                with torch.no_grad():
-                    jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
-                    jit_model = torch.jit.freeze(jit_model)
-                    jit_res = jit_model(a1, a2, a3)
-                    ori_res = model(a1, a2, a3)
-                    self.assertEqual(jit_res, ori_res)
-            else:
-                model = ConcatBnRelu3d().eval()
-                model = ipex.optimize(model, dtype=torch.float32, level='O0')
-                with torch.no_grad():
-                    jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
-                    jit_model = torch.jit.freeze(jit_model)
-                    jit_res = jit_model(a1, a2, a3)
-                    ori_res = model(a1, a2, a3)
-                    self.assertEqual(jit_res, ori_res)
-        else:
-            if use_channels_last:
-                model = ConcatBnRelu2d().eval().to(memory_format=torch.channels_last)
-                model = ipex.optimize(model, dtype=torch.float32, level='O0')
-                with torch.no_grad():
-                    jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
-                    jit_model = torch.jit.freeze(jit_model)
-                    jit_res = jit_model(a1, a2, a3)
-                    ori_res = model(a1, a2, a3)
-                    self.assertEqual(jit_res, ori_res)
-            else:
-                model = ConcatBnRelu2d().eval()
-                model = ipex.optimize(model, dtype=torch.float32, level='O0')
-                with torch.no_grad():
-                    jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
-                    jit_model = torch.jit.freeze(jit_model)
-                    jit_res = jit_model(a1, a2, a3)
-                    ori_res = model(a1, a2, a3)
-                    self.assertEqual(jit_res, ori_res)
-
     def test_concat_bn_relu(self):
         a1 = torch.randn(1, 32, 13, 24, dtype=torch.bfloat16).contiguous(memory_format=torch.channels_last)
         a2 = torch.randn(1, 32, 13, 24, dtype=torch.bfloat16).contiguous(memory_format=torch.channels_last)
@@ -1010,8 +970,10 @@ def test_concat_bn_relu(self):
         with torch.no_grad():
             jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
             jit_model = torch.jit.freeze(jit_model)
-            jit_res = jit_model(a1, a2, a3)
-            ori_res = model(a1, a2, a3)
+            #warmup run
+            for _ in range(2):
+                jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
             self.assertEqual(jit_res, ori_res)
 
         a1 = torch.randn(1, 32, 13, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
@@ -1022,46 +984,92 @@ def test_concat_bn_relu(self):
         with torch.no_grad():
             jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
             jit_model = torch.jit.freeze(jit_model)
-            jit_res = jit_model(a1, a2, a3)
-            ori_res = model(a1, a2, a3)
+            #warmup run
+            for _ in range(2):
+                jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
             self.assertEqual(jit_res, ori_res)
 
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=False, use_channels_last=True)
+        model = ConcatBnRelu2d().eval().to(memory_format=torch.channels_last)
+        model = ipex.optimize(model, dtype=torch.float32, level='O0')
+        with torch.no_grad():
+            jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
+            jit_model = torch.jit.freeze(jit_model)
+            #warmup run
+            for _ in range(2):
+                jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
-        a1 = torch.randn(1, 16, 13, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
-        a2 = torch.randn(1, 48, 13, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
-        a3 = torch.randn(1, 32, 13, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=False, use_channels_last=True)
+        a1 = torch.randn(1, 32, 18, 53, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        a2 = torch.randn(1, 32, 18, 53, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        a3 = torch.randn(1, 32, 18, 53, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        with torch.no_grad():
+            jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
-        a1 = torch.randn(1, 17, 13, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
-        a2 = torch.randn(1, 47, 13, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
-        a3 = torch.randn(1, 32, 13, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=False, use_channels_last=True)
+        a1 = torch.randn(1, 16, 24, 116, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        a2 = torch.randn(1, 48, 24, 116, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        a3 = torch.randn(1, 32, 24, 116, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        with torch.no_grad():
+            jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
+
+        a1 = torch.randn(1, 17, 15, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        a2 = torch.randn(1, 47, 15, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        a3 = torch.randn(1, 32, 15, 24, dtype=torch.float).contiguous(memory_format=torch.channels_last)
+        with torch.no_grad():
+            jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
         a1 = torch.randn(1, 32, 13, 24, dtype=torch.float)
         a2 = torch.randn(1, 32, 13, 24, dtype=torch.float)
         a3 = torch.randn(1, 32, 13, 24, dtype=torch.float)
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=False, use_channels_last=False)
+        with torch.no_grad():
+            jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
         a1 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
         a2 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
         a3 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=True, use_channels_last=True)
+        model = ConcatBnRelu3d().eval().to(memory_format=torch.channels_last_3d)
+        model = ipex.optimize(model, dtype=torch.float32, level='O0')
+        with torch.no_grad():
+            jit_model = torch.jit.trace(model, (a1, a2, a3)).eval()
+            jit_model = torch.jit.freeze(jit_model)
+            #warmup run
+            for _ in range(2):
+                jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
-        a1 = torch.randn(1, 16, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
-        a2 = torch.randn(1, 48, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
-        a3 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=True, use_channels_last=True)
+        a1 = torch.randn(1, 16, 17, 14, 31, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
+        a2 = torch.randn(1, 48, 17, 14, 31, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
+        a3 = torch.randn(1, 32, 17, 14, 31, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
+        with torch.no_grad():
+            jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
         a1 = torch.randn(1, 17, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
         a2 = torch.randn(1, 47, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
         a3 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float).contiguous(memory_format=torch.channels_last_3d)
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=True, use_channels_last=True)
+        with torch.no_grad():
+            jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
         a1 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float)
         a2 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float)
         a3 = torch.randn(1, 32, 13, 24, 33, dtype=torch.float)
-        self._test_concat_bn_relu(a1, a2, a3, enable_3d=True, use_channels_last=False)
+        with torch.no_grad():
+            jit_res = jit_model(a1, a2, a3)
+            ori_res = model(a1, a2, a3)
+            self.assertEqual(jit_res, ori_res)
 
     def test_mha_scores_calculation(self):
         def _check_match_mha(trace_model, mat1, mat2, bias, node = "ipex::mha_scores_calc"):
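About the repeated "#warmup run" loops added to test_concat_bn_relu: TorchScript's profiling executor generally applies graph optimizations, including the ipex::concat_bn_relu rewrite, only after the first couple of invocations of a traced and frozen module, so asserting on the very first call could exercise the unfused graph. A self-contained sketch of the pattern in plain PyTorch (ipex.optimize is omitted here, so this only illustrates the warmup structure, not the IPEX fusion itself):

import torch
import torch.nn as nn

class TinyConcatBnRelu(nn.Module):
    # Stand-in for the ConcatBnRelu2d module used by the test suite.
    def __init__(self, channels=96):
        super().__init__()
        self.bn = nn.BatchNorm2d(channels)

    def forward(self, a, b, c):
        return torch.relu(self.bn(torch.cat([a, b, c], dim=1)))

model = TinyConcatBnRelu().eval().to(memory_format=torch.channels_last)
a, b, c = (torch.randn(1, 32, 13, 24).contiguous(memory_format=torch.channels_last)
           for _ in range(3))
with torch.no_grad():
    jit_model = torch.jit.freeze(torch.jit.trace(model, (a, b, c)).eval())
    for _ in range(2):  # warmup runs so profiling/fusion passes can take effect
        jit_res = jit_model(a, b, c)
    ori_res = model(a, b, c)
print(torch.allclose(jit_res, ori_res))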
