[microTVM] Modernize Arm Cortex-M convolution schedules (apache#13242)
* Quantized Corstone300 test draft

* Add QNN strategy with operator fusion for Cortex-M

Get QNN strategy running

QNN strategy with operator fusion

* Add assembly tensordot code from other PR

Assembly tensordot from other PR

Tensordot offset support

Hand tested tensordot code

* Helper work to support microTVM TIR schedules

Formatting fixes

Don't use automatic AOT building when skipping pass

Assorted tech for scheduling with TIR

Hacky int16 support

* TIR schedule for microTVM conv2d

Bugged schedule implementation

Passing test!

Works for all 1x1 conv2ds!

External QNN operator altering

Debugging work

Pad with correct constant

Broadly functional conv2d

Reorganize quantize convolution test

* TIR schedule for microTVM depthwise_conv2d

Working depthwise convolution for strides=1

Working depthwise convolution!

* Clean up code

Support Python 3.7

Clean up code to prepare for review

* Break qnn.py into helper functions

* Finish reorganizing qnn.py

* Fix linting

* Remove residual debug code and fix linting

* Try repairing unit tests

* Run black to fix linting

* Address code review comments

* Second round of code review

Second round of code review

Fix tensordot opts test

* Address @areusch code review

* More code review

* Catch VWW model download with request hook
guberti authored Dec 6, 2022
1 parent 8d31b25 commit bbba8d9
Showing 19 changed files with 1,775 additions and 506 deletions.
17 changes: 17 additions & 0 deletions python/tvm/relay/op/nn/_nn.py
@@ -877,6 +877,23 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts):
    return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)


# QNN ops
@reg.register_alter_op_layout("add")
def alter_op_layout_add(attrs, inputs, tinfos, out_type):
"""Alter the layout of a add op.
Useful for fusing the bias constant with an input zero point constant in a previous quantized
op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn.
"""
return topi.nn.qnn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type)


@reg.register_alter_op_layout("qnn.requantize")
def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
"""Alter the layout of a requantization op."""
return topi.nn.qnn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type)


# bitpack
@reg.register_compute("nn.bitpack")
def compute_bitpack(attrs, inputs, out_dtype):
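For readers unfamiliar with the interface these registrations expect, here is a minimal, hypothetical sketch (not the actual topi.nn.qnn implementation): an alter-op-layout hook receives the call's attributes, argument expressions, argument type information, and output type, and returns either a replacement Relay expression or None to leave the op unchanged.

# Hypothetical sketch of the hook shape used by register_alter_op_layout.
# The real hooks (topi.nn.qnn.qnn_add_alter_layout and qnn_requantize_alter_layout)
# inspect the argument types to fold constants into adjacent quantized ops; this
# sketch only illustrates the expected signature and control flow.
from tvm import relay


def example_add_alter_layout(attrs, inputs, tinfos, out_type):
    """Illustrative hook; registration would use a decorator like the ones above."""
    lhs, rhs = inputs
    # Returning None tells the AlterOpLayout pass to keep the original op.
    if not isinstance(rhs, relay.Constant):
        return None
    # A real hook would return a rewritten expression here; we rebuild the same
    # add so the sketch stays self-contained.
    return relay.add(lhs, rhs)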
1 change: 1 addition & 0 deletions python/tvm/relay/qnn/strategy/__init__.py
@@ -20,4 +20,5 @@
from __future__ import absolute_import as _abs

from .generic import *
from . import arm_cpu
from . import hexagon
72 changes: 72 additions & 0 deletions python/tvm/relay/qnn/strategy/arm_cpu.py
@@ -0,0 +1,72 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Quantized operator strategy for Arm CPU.
As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current
schedules only work for fused operators with bias, as this is the most common use case. Only
regular/depthwise conv2d is supported, but qnn_dense will be added eventually."""

from tvm import topi, TVMError
from .generic import qnn_conv2d_strategy
from ... import op as _op
from ...op.strategy.generic import is_depthwise_conv2d


@qnn_conv2d_strategy.register("arm_cpu")
def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
"""qnn.conv2d strategy for Arm Cortex-M CPUs with DSP.
When computing convolutions, we want data that will be used to compute the same output values to
be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions.
For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW
layouts to the best job of keeping "related" data close. In contrast, computing one output of a
regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these
are the layouts we support.
"""

    if not (target.features.has_dsp and "cortex-m" in target.mcpu):
        raise TVMError(
            "Quantized Arm schedules only exist for Cortex-M with DSP! "
            "The qnn.Legalize pass should be run for other Arm processors."
        )

    data = inputs[0]
    kernel = inputs[1]
    data_layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    groups = attrs.groups
    strategy = _op.OpStrategy()

    if groups == 1:
        if data_layout == "NHWC" and kernel_layout == "OHWI":
            strategy.add_implementation(
                topi.arm_cpu.qnn_conv2d,
                topi.arm_cpu.schedule_qnn_conv2d,
                name="qnn_conv2d.arm_cpu",
            )
    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
        if data_layout == "NCHW" and kernel_layout == "IOHW":
            strategy.add_implementation(
                topi.arm_cpu.qnn_depthwise_conv2d,
                topi.arm_cpu.schedule_qnn_depthwise_conv2d,
                name="qnn_depthwise_conv2d.arm_cpu",
            )
    else:
        raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d")

    return strategy
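As a hedged usage sketch, the regular-convolution branch above is selected for a qnn.conv2d expressed with NHWC activations and OHWI weights, compiled for a DSP-capable Cortex-M target (such as the one produced by tvm.target.target.micro("stm32f746xx")) with the qnn.Legalize pass disabled, as the module docstring notes. The shapes, scales, and zero points below are illustrative only and are not part of this commit.

import numpy as np
from tvm import relay

# Illustrative shapes: NHWC int8 activations, OHWI int8 weights.
data = relay.var("data", shape=(1, 32, 32, 16), dtype="int8")
kernel = relay.const(np.zeros((8, 3, 3, 16), dtype="int8"))

conv = relay.qnn.op.conv2d(
    data,
    kernel,
    input_zero_point=relay.const(0, "int32"),
    kernel_zero_point=relay.const(0, "int32"),
    input_scale=relay.const(0.15, "float32"),
    kernel_scale=relay.const(0.15, "float32"),
    kernel_size=(3, 3),
    channels=8,
    padding=(1, 1),
    data_layout="NHWC",
    kernel_layout="OHWI",
)
func = relay.Function(relay.analysis.free_vars(conv), conv)

In practice the schedules target the fused pattern the module docstring describes (convolution followed by a bias add and requantize), so a bare qnn.conv2d like this is only a sketch of the layout requirements.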
2 changes: 2 additions & 0 deletions python/tvm/topi/arm_cpu/__init__.py
@@ -23,9 +23,11 @@
from .conv2d_transpose import *
from .conv2d_int8 import *
from . import conv2d_alter_op
from . import qnn_alter_op
from .bitserial_conv2d import *
from .bitserial_dense import *
from .injective import *
from .group_conv2d import *
from .pooling import *
from .dense import *
from .qnn import *
18 changes: 0 additions & 18 deletions python/tvm/topi/arm_cpu/conv2d.py
@@ -37,10 +37,6 @@
    conv2d_nhwc_dsp_compute,
    conv2d_nhwc_dsp_schedule,
)
from .mprofile.dsp.tensordot_conv2ds import (
    conv2d_nhwc_ohwi_dsp_compute,
    tensordot_conv2ds_schedule,
)


@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
@@ -522,17 +518,3 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
def schedule_conv2d_nhwc_dsp(cfg, outs):
"""Create schedule for conv2d_nhwc_dsp"""
return conv2d_nhwc_dsp_schedule(cfg, outs)


@autotvm.register_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu")
def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype):
"""Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot kernel."""
return conv2d_nhwc_ohwi_dsp_compute(
cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
)


@autotvm.register_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu")
def schedule_conv2d_nhwc_ohwi_dsp(cfg, outs):
"""Create schedule for conv2d_nhwc_ohwi."""
return tensordot_conv2ds_schedule(cfg, outs)
20 changes: 0 additions & 20 deletions python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -31,10 +31,6 @@
    depthwise_conv2d_nhwc_dsp_compute,
    depthwise_conv2d_nhwc_dsp_schedule,
)
from .mprofile.dsp.tensordot_conv2ds import (
    depthwise_conv2d_nchw_oihw_dsp_compute,
    tensordot_conv2ds_schedule,
)


@autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -722,19 +718,3 @@ def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out
def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
"""Create schedule for conv2d_nhwc_dsp"""
return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs)


@autotvm.register_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
def depthwise_conv2d_nchw_oihw_dsp(
    cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
):
    """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the tensordot kernel."""
    return depthwise_conv2d_nchw_oihw_dsp_compute(
        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
    )


@autotvm.register_topi_schedule("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
def schedule_depthwise_conv2d_nchw_oihw_dsp(cfg, outs):
"""Create schedule for depthwise_conv2d_nchw_oihw."""
return tensordot_conv2ds_schedule(cfg, outs)