[microTVM] Modernize Arm Cortex-M convolution schedules (apache#13242)
* Quantized Corstone300 test draft

* Add QNN strategy with operator fusion for Cortex-M

Get QNN strategy running

QNN strategy with operator fusion

* Add assembly tensordot code from other PR

Assembly tensordot from other PR

Tensordot offset support

Hand tested tensordot code

* Helper work to support microTVM TIR schedules

Formatting fixes

Don't use automatic AOT building when skipping pass

Assorted tech for scheduling with TIR

Hacky int16 support

* TIR schedule for microTVM conv2d

Bugged schedule implementation

Passing test!

Works for all 1x1 conv2ds!

External QNN operator altering

Debugging work

Pad with correct constant

Broadly functional conv2d

Reorganize quantize convolution test

* TIR schedule for microTVM depthwise_conv2d

Working depthwise convolution for strides=1

Working depthwise convolution!

* Clean up code

Support Python 3.7

Clean up code to prepare for review

* Break qnn.py into helper functions

* Finish reorganizing qnn.py

* Fix linting

* Remove residual debug code and fix linting

* Try repairing unit tests

* Run black to fix linting

* Address code review comments

* Second round of code review

Second round of code review

Fix tensordot opts test

* Address @areusch code review

* More code review

* Catch VWW model download with request hook
guberti authored Dec 6, 2022
1 parent 8d31b25 commit bbba8d9
Showing 19 changed files with 1,775 additions and 506 deletions.
17 changes: 17 additions & 0 deletions python/tvm/relay/op/nn/_nn.py
@@ -877,6 +877,23 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts):
    return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)


# QNN ops
@reg.register_alter_op_layout("add")
def alter_op_layout_add(attrs, inputs, tinfos, out_type):
"""Alter the layout of a add op.
Useful for fusing the bias constant with an input zero point constant in a previous quantized
op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn.
"""
return topi.nn.qnn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type)


@reg.register_alter_op_layout("qnn.requantize")
def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
"""Alter the layout of a requantization op."""
return topi.nn.qnn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type)


# bitpack
@reg.register_compute("nn.bitpack")
def compute_bitpack(attrs, inputs, out_dtype):
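For readers unfamiliar with the interface these registrations expect, here is a minimal, hypothetical sketch (not the actual topi.nn.qnn implementation): an alter-op-layout hook receives the call's attributes, argument expressions, argument type information, and output type, and returns either a replacement Relay expression or None to leave the op unchanged.

# Hypothetical sketch of the hook shape used by register_alter_op_layout.
# The real hooks (topi.nn.qnn.qnn_add_alter_layout and qnn_requantize_alter_layout)
# inspect the argument types to fold constants into adjacent quantized ops; this
# sketch only illustrates the expected signature and control flow.
from tvm import relay


def example_add_alter_layout(attrs, inputs, tinfos, out_type):
    """Illustrative hook; registration would use a decorator like the ones above."""
    lhs, rhs = inputs
    # Returning None tells the AlterOpLayout pass to keep the original op.
    if not isinstance(rhs, relay.Constant):
        return None
    # A real hook would return a rewritten expression here; we rebuild the same
    # add so the sketch stays self-contained.
    return relay.add(lhs, rhs)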
1 change: 1 addition & 0 deletions python/tvm/relay/qnn/strategy/__init__.py
@@ -20,4 +20,5 @@
from __future__ import absolute_import as _abs

from .generic import *
from . import arm_cpu
from . import hexagon
72 changes: 72 additions & 0 deletions python/tvm/relay/qnn/strategy/arm_cpu.py
@@ -0,0 +1,72 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Quantized operator strategy for Arm CPU.
As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current
schedules only work for fused operators with bias, as this is the most common use case. Only
regular/depthwise conv2d is supported, but qnn_dense will be added eventually."""

from tvm import topi, TVMError
from .generic import qnn_conv2d_strategy
from ... import op as _op
from ...op.strategy.generic import is_depthwise_conv2d


@qnn_conv2d_strategy.register("arm_cpu")
def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
"""qnn.conv2d strategy for Arm Cortex-M CPUs with DSP.
When computing convolutions, we want data that will be used to compute the same output values to
be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions.
For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW
layouts to the best job of keeping "related" data close. In contrast, computing one output of a
regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these
are the layouts we support.
"""

    if not (target.features.has_dsp and "cortex-m" in target.mcpu):
        raise TVMError(
            "Quantized Arm schedules only exist for Cortex-M with DSP! "
            "The qnn.Legalize pass should be run for other Arm processors."
        )

    data = inputs[0]
    kernel = inputs[1]
    data_layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    groups = attrs.groups
    strategy = _op.OpStrategy()

    if groups == 1:
        if data_layout == "NHWC" and kernel_layout == "OHWI":
            strategy.add_implementation(
                topi.arm_cpu.qnn_conv2d,
                topi.arm_cpu.schedule_qnn_conv2d,
                name="qnn_conv2d.arm_cpu",
            )
    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
        if data_layout == "NCHW" and kernel_layout == "IOHW":
            strategy.add_implementation(
                topi.arm_cpu.qnn_depthwise_conv2d,
                topi.arm_cpu.schedule_qnn_depthwise_conv2d,
                name="qnn_depthwise_conv2d.arm_cpu",
            )
    else:
        raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d")

    return strategy
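As a hedged usage sketch, the regular-convolution branch above is selected for a qnn.conv2d expressed with NHWC activations and OHWI weights, compiled for a DSP-capable Cortex-M target (such as the one produced by tvm.target.target.micro("stm32f746xx")) with the qnn.Legalize pass disabled, as the module docstring notes. The shapes, scales, and zero points below are illustrative only and are not part of this commit.

import numpy as np
from tvm import relay

# Illustrative shapes: NHWC int8 activations, OHWI int8 weights.
data = relay.var("data", shape=(1, 32, 32, 16), dtype="int8")
kernel = relay.const(np.zeros((8, 3, 3, 16), dtype="int8"))

conv = relay.qnn.op.conv2d(
    data,
    kernel,
    input_zero_point=relay.const(0, "int32"),
    kernel_zero_point=relay.const(0, "int32"),
    input_scale=relay.const(0.15, "float32"),
    kernel_scale=relay.const(0.15, "float32"),
    kernel_size=(3, 3),
    channels=8,
    padding=(1, 1),
    data_layout="NHWC",
    kernel_layout="OHWI",
)
func = relay.Function(relay.analysis.free_vars(conv), conv)

In practice the schedules target the fused pattern the module docstring describes (convolution followed by a bias add and requantize), so a bare qnn.conv2d like this is only a sketch of the layout requirements.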
2 changes: 2 additions & 0 deletions python/tvm/topi/arm_cpu/__init__.py
@@ -23,9 +23,11 @@
from .conv2d_transpose import *
from .conv2d_int8 import *
from . import conv2d_alter_op
from . import qnn_alter_op
from .bitserial_conv2d import *
from .bitserial_dense import *
from .injective import *
from .group_conv2d import *
from .pooling import *
from .dense import *
from .qnn import *
18 changes: 0 additions & 18 deletions python/tvm/topi/arm_cpu/conv2d.py
@@ -37,10 +37,6 @@
    conv2d_nhwc_dsp_compute,
    conv2d_nhwc_dsp_schedule,
)
from .mprofile.dsp.tensordot_conv2ds import (
    conv2d_nhwc_ohwi_dsp_compute,
    tensordot_conv2ds_schedule,
)


@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
@@ -522,17 +518,3 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
def schedule_conv2d_nhwc_dsp(cfg, outs):
"""Create schedule for conv2d_nhwc_dsp"""
return conv2d_nhwc_dsp_schedule(cfg, outs)


@autotvm.register_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu")
def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype):
"""Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot kernel."""
return conv2d_nhwc_ohwi_dsp_compute(
cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
)


@autotvm.register_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu")
def schedule_conv2d_nhwc_ohwi_dsp(cfg, outs):
"""Create schedule for conv2d_nhwc_ohwi."""
return tensordot_conv2ds_schedule(cfg, outs)
20 changes: 0 additions & 20 deletions python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -31,10 +31,6 @@
    depthwise_conv2d_nhwc_dsp_compute,
    depthwise_conv2d_nhwc_dsp_schedule,
)
from .mprofile.dsp.tensordot_conv2ds import (
    depthwise_conv2d_nchw_oihw_dsp_compute,
    tensordot_conv2ds_schedule,
)


@autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -722,19 +718,3 @@ def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out
def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
"""Create schedule for conv2d_nhwc_dsp"""
return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs)


@autotvm.register_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
def depthwise_conv2d_nchw_oihw_dsp(
    cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
):
    """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the tensordot kernel."""
    return depthwise_conv2d_nchw_oihw_dsp_compute(
        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
    )


@autotvm.register_topi_schedule("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
def schedule_depthwise_conv2d_nchw_oihw_dsp(cfg, outs):
"""Create schedule for depthwise_conv2d_nchw_oihw."""
return tensordot_conv2ds_schedule(cfg, outs)