diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 53aec11e5816..e956c82828c1 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -877,6 +877,23 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts): return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs) +# QNN ops +@reg.register_alter_op_layout("add") +def alter_op_layout_add(attrs, inputs, tinfos, out_type): + """Alter the layout of a add op. + + Useful for fusing the bias constant with an input zero point constant in a previous quantized + op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn. + """ + return topi.nn.qnn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type) + + +@reg.register_alter_op_layout("qnn.requantize") +def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type): + """Alter the layout of a requantization op.""" + return topi.nn.qnn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type) + + # bitpack @reg.register_compute("nn.bitpack") def compute_bitpack(attrs, inputs, out_dtype): diff --git a/python/tvm/relay/qnn/strategy/__init__.py b/python/tvm/relay/qnn/strategy/__init__.py index 05778c3e9f86..d7b669a4fa42 100644 --- a/python/tvm/relay/qnn/strategy/__init__.py +++ b/python/tvm/relay/qnn/strategy/__init__.py @@ -20,4 +20,5 @@ from __future__ import absolute_import as _abs from .generic import * +from . import arm_cpu from . import hexagon diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py new file mode 100644 index 000000000000..f8653817835e --- /dev/null +++ b/python/tvm/relay/qnn/strategy/arm_cpu.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Quantized operator strategy for Arm CPU. + +As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current +schedules only work for fused operators with bias, as this is the most common use case. Only +regular/depthwise conv2d is supported, but qnn_dense will be added eventually.""" + +from tvm import topi, TVMError +from .generic import qnn_conv2d_strategy +from ... import op as _op +from ...op.strategy.generic import is_depthwise_conv2d + + +@qnn_conv2d_strategy.register("arm_cpu") +def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): + """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP. + + When computing convolutions, we want data that will be used to compute the same output values to + be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions. + + For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW + layouts to the best job of keeping "related" data close. 
In contrast, computing one output of a + regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these + are the layouts we support. + """ + + if not (target.features.has_dsp and "cortex-m" in target.mcpu): + raise TVMError( + "Quantized Arm schedules only exist for Cortex-M with DSP! " + "The qnn.Legalize pass should be run for other Arm processors." + ) + + data = inputs[0] + kernel = inputs[1] + data_layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + groups = attrs.groups + strategy = _op.OpStrategy() + + if groups == 1: + if data_layout == "NHWC" and kernel_layout == "OHWI": + strategy.add_implementation( + topi.arm_cpu.qnn_conv2d, + topi.arm_cpu.schedule_qnn_conv2d, + name="qnn_conv2d.arm_cpu", + ) + elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups): + if data_layout == "NCHW" and kernel_layout == "IOHW": + strategy.add_implementation( + topi.arm_cpu.qnn_depthwise_conv2d, + topi.arm_cpu.schedule_qnn_depthwise_conv2d, + name="qnn_depthwise_conv2d.arm_cpu", + ) + else: + raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d") + + return strategy diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index 20f92a8895dd..eba102662bc4 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -23,9 +23,11 @@ from .conv2d_transpose import * from .conv2d_int8 import * from . import conv2d_alter_op +from . import qnn_alter_op from .bitserial_conv2d import * from .bitserial_dense import * from .injective import * from .group_conv2d import * from .pooling import * from .dense import * +from .qnn import * diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py index fc46f4b34f9d..ab489161a8fa 100644 --- a/python/tvm/topi/arm_cpu/conv2d.py +++ b/python/tvm/topi/arm_cpu/conv2d.py @@ -37,10 +37,6 @@ conv2d_nhwc_dsp_compute, conv2d_nhwc_dsp_schedule, ) -from .mprofile.dsp.tensordot_conv2ds import ( - conv2d_nhwc_ohwi_dsp_compute, - tensordot_conv2ds_schedule, -) @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") @@ -522,17 +518,3 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype): def schedule_conv2d_nhwc_dsp(cfg, outs): """Create schedule for conv2d_nhwc_dsp""" return conv2d_nhwc_dsp_schedule(cfg, outs) - - -@autotvm.register_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu") -def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype): - """Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot kernel.""" - return conv2d_nhwc_ohwi_dsp_compute( - cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype - ) - - -@autotvm.register_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu") -def schedule_conv2d_nhwc_ohwi_dsp(cfg, outs): - """Create schedule for conv2d_nhwc_ohwi.""" - return tensordot_conv2ds_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index 9284b9474513..b6c15a30c037 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -31,10 +31,6 @@ depthwise_conv2d_nhwc_dsp_compute, depthwise_conv2d_nhwc_dsp_schedule, ) -from .mprofile.dsp.tensordot_conv2ds import ( - depthwise_conv2d_nchw_oihw_dsp_compute, - tensordot_conv2ds_schedule, -) @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") @@ -722,19 +718,3 @@ def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, 
strides, padding, dilation, out def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs): """Create schedule for conv2d_nhwc_dsp""" return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs) - - -@autotvm.register_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu") -def depthwise_conv2d_nchw_oihw_dsp( - cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype -): - """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the tensordot kernel.""" - return depthwise_conv2d_nchw_oihw_dsp_compute( - cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype - ) - - -@autotvm.register_topi_schedule("depthwise_conv2d_nchw_oihw_dsp.arm_cpu") -def schedule_depthwise_conv2d_nchw_oihw_dsp(cfg, outs): - """Create schedule for depthwise_conv2d_nchw_oihw.""" - return tensordot_conv2ds_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py index 0fdffc06cf4f..1d36e1dd1e9c 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py @@ -14,142 +14,391 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Computes a "jumpy tensordot" operator, which can be used to tensorize many common operators -including regular conv2d, depthwise conv2d, and grouped conv2d provided the data and kernel layouts -are the optimal ones. When groups=1, the optimal data layout is NHWC and kernel layout is OHWI. When -this is a depthwise convolution, the optimal data layout is NCHW and kernel layout is OIHW.""" +"""Generates optimized code to compute a tensor dot product on ARMv7E-M. +This function can be used to tensorize many common operators including regular conv2d, depthwise +conv2d, and grouped conv2d for some data and kernel layouts. When for regular convolution, use data +layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout data layout is NCHW +and kernel layout OIHW. + +The generated code will also work on v8-M chips that have the DSP instructions (unlike v7E-M, they +are optional in v8-M). Note that the generated code does not use the (potentially very useful) MVE +instructions present on some v8-M chips. +""" + +from dataclasses import dataclass +from itertools import chain import textwrap +from typing import Iterator, Optional, Tuple -from tvm import te, tir -from .common import num_simd_lanes_per_word +@dataclass +class SMLAInstruction: + """Class for keeping track of an item in inventory.""" + instruction: str + tensor_var: str + kernel_var: str -def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix): - """Gets the C function name of the tensordot function.""" - return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}" + def call_with_acle(self, accumulator_var: str) -> str: + return ( + f"{accumulator_var} = __{self.instruction}" + f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});" + ) + def has_same_operands(self, other: "SMLAInstruction") -> bool: + return self.tensor_var == other.tensor_var and self.kernel_var == other.kernel_var -def make_intrin_tensordot(slices, strides, tensordot_params): - """Helper function for constructing tensordot intrinsic. 
We can't construct the whole thing here - (as multiple schedules use tensordot and each must build the intrinstic differently) but we can - build part here to simplify the code.""" - # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params - data, kernel, output = slices - data_strides, kernel_strides = strides +def _get_c_function_name(num_outputs, dimensions, offsets, x_strides): + """Generates a C function name for tensordot. - data_buf = tir.decl_buffer( - data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides - ) - kernel_buf = tir.decl_buffer( - kernel.shape, - kernel.dtype, - name="kernel", - offset_factor=1, - strides=kernel_strides, - ) - output_buf = tir.decl_buffer( - output.shape, output.dtype, name="output", offset_factor=1, strides=[1] + We do not need a suffix, as the generated function will have an #include guard. Unlike other + microTVM operators, _get_c_function_name is never called externally. + """ + tensor_w, kernel_h, kernel_w = dimensions + return ( + f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_" + + f"{kernel_h}x{kernel_w}_" + + "".join(map(str, offsets)) + + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "") ) - def intrin_func(ins, outs): - builder = tir.ir_builder.create() - builder.emit( - tir.call_extern( - "int32", - _get_func_name(*tensordot_params), - outs[0].access_ptr("w"), - ins[0].access_ptr("r"), - ins[1].access_ptr("r"), - ) - ) - return builder.get() - return te.decl_tensor_intrin( - output.op, - intrin_func, - binds={data: data_buf, kernel: kernel_buf, output: output_buf}, - ) +def _init_biased_accumulators(num_outputs): + """Generates code to load the bias into the accumulators. + + Addition is commutative, so we could add the bias before, during, or after performing our + multiply-accumulate operations. Where we add the bias does not change the overflow behavior. + + Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY + trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register, + so we'll do it first. + """ + assignments = [f"sum_{x:x} = *bias" for x in range(num_outputs)] + joined_assignments = ", ".join(assignments) + return f"int32_t {joined_assignments};" + + +def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) -> Iterator[Optional[Tuple]]: + """Gets the logical indices of the data that will be stored in memory at the tensor pointer. + + Returns an Iterator of Optional[Tuple], while skipping over word-aligned pairs of unrelated + halfwords. The returned iterator is as short as possible while having even length and containing + all relevant tensor data. Tuples in the returned Iterator represent an (y, x) offset from the + top-left tensor position being used in this convolution. We need to be aware of the None values + so our code is correctly word-aligned. + + One consequence of these requirements - each row in the tensor is broken into word-aligned pairs + of halfwords (which are later combined into full words). See the test cases (located in + tests/python/topi/python/test_topi_conv2d_tensordot_opts.py) for usage examples. 
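+
+    As a small illustrative case (hypothetical shapes, not taken from a real model): with
+    dimensions=(3, 2, 2), offset=0, num_outputs=1, and in_stride=1, the second kernel row starts
+    at halfword 3 of the data tensor, which is not word-aligned, so that row is padded on both
+    sides:
+
+        >>> _get_tensor_halfwords((3, 2, 2), 0, 1, 1)
+        [(0, 0), (0, 1), None, (1, 0), (1, 1), None]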
+ """ + + tensor_w, kernel_h, kernel_w = dimensions + max_x_val = (num_outputs - 1) * in_stride + kernel_w + halfwords = [] + + for y in range(kernel_h): + # If needed, pad so the beginning of the row is word-aligned + if (y * tensor_w + offset) % 2 == 1: + halfwords.append(None) + + for x in range(max_x_val): + halfwords.append((y, x)) + + # If needed, pad so the row length is word aligned + if (y * tensor_w + offset + max_x_val) % 2 == 1: + halfwords.append(None) + return halfwords + + +def _get_kernel_halfwords(dimensions, offset) -> Iterator[Optional[Tuple]]: + """Gets the logical indices of the data that will be stored in memory at the kernel pointer. + Returns an Iterator of Optional[Tuple]. The returned iterator is as short as possible while + having even length and containing all kernel data. Tuples in the returned Iterator represent + an (y, x) position in the kernel, while None values represent other, irrelevant data. We need + to be aware of the None values so our code is correctly word-aligned. -def tensordot_impl(in_dtype: str, tensor_h: int, jump: int, tensor_w: int, suffix: str) -> str: - """Generates C code for taking the dot products of two `tensor_h` * `tensor_w` tensors. Also has - a `jump` argument that advances the pointer of one tensor by that many words after each row. The - `jump` and `tensor_w` values must be word-aligned for the input data type, as non-word-aligned - memory access is slow on the Cortex-M series. Depending on the input datatype, the code may - contain DSP instructions for Arm v7e-m. C code contains DSP instructions for Arm v7e-m. See - the below pseudocode for reference: - - tensordot(out_ptr, dat_ptr, ker_ptr) { - sum = 0; - for (i = 0; i < tensor_h; i++) { - for (j = 0; j < tensor_w; j++) { - sum += (*dat_ptr++) * (*ker_ptr++); - } - dat_ptr += jump; - } - *out_ptr = sum; - } + See test cases in tests/python/topi/python/test_topi_conv2d_tensordot_opts.py for examples. """ + _, kernel_h, kernel_w = dimensions + halfwords = [] - simd_lanes = num_simd_lanes_per_word(in_dtype) - assert tensor_w % simd_lanes == 0 - assert jump % simd_lanes == 0 + # Kernel data starts `offset` places after the pointer value + if offset == 1: + halfwords.append(None) - if in_dtype == "int8": - inner_loop = """ - uint32_t tensor_c20 = __SXTB16(tensor_batch); - uint32_t kernel_c20 = __SXTB16(kernel_batch); - sum = __SMLAD(tensor_c20, kernel_c20, sum); + for y in range(kernel_h): + for x in range(kernel_w): + halfwords.append((y, x)) - uint32_t tensor_c31 = __SXTB16(__ROR(tensor_batch, 8)); - uint32_t kernel_c31 = __SXTB16(__ROR(kernel_batch, 8)); - sum = __SMLAD(tensor_c31, kernel_c31, sum);""" + # Make sure the returned iterator has even length by padding with an "unknown" value. We want + # even length as this corresponds to an integer number of int32 words. 
+ if (kernel_h * kernel_w + offset) % 2 == 1: + halfwords.append(None) + return halfwords - elif in_dtype == "int16": - inner_loop = """ - sum = __SMLAD(tensor_batch, kernel_batch, sum);""" - elif in_dtype == "int32": - inner_loop = """ - // Compiles to a single MAC instruction - sum += tensor_batch * kernel_batch;""" +def _get_int16_alias(position) -> str: + if position is None: + return "unknown" + y, x = position + return f"y{y:0>2x}_x{x:0>2x}" + + +def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + offset = int(not bool(halfwords[0])) + + for i in range(0, len(halfwords), 2): + var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}" + y, x = halfwords[i + 1] or halfwords[i] + tensor_index = (y * tensor_w + x + offset) // 2 + yield f"int32_t tensor__{var_name} = tensor[{tensor_index}];" + + +def _load_kernel_vars(halfwords) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + for i in range(0, len(halfwords), 2): + var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}" + yield f"int32_t kernel__{var_name} = kernel[{i // 2}];" + + +def _get_draft_macs( + kernel_dims, tensor_halfwords, kernel_halfwords, offset +) -> Iterator[SMLAInstruction]: + """Generates unrolled MAC instructions to compute one tensordot sum. + + Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code + much faster. The generated code does not use SIMD instructions - they are added later by + _apply_simd_optimizations. + + We return an iterator of SMLAInstruction named tuples. Returning an iterator lets us do + optimizations by iterator chaining. + """ + + def get_var(y, x, halfwords) -> Tuple[str, str]: + i = halfwords.index((y, x)) + if i % 2 == 0: + return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b" + return f"{_get_int16_alias(halfwords[i - 1])}__{_get_int16_alias((y, x))}", "t" + + kernel_h, kernel_w = kernel_dims + for y in range(kernel_h): + for x in range(kernel_w): + tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords) + kernel_var, kernel_half = get_var(y, x, kernel_halfwords) + instruction = f"smla{tensor_half}{kernel_half}" + yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}") + + +def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]: + """When possible, fuses single MACs into SIMD MAC instructions. + + The compiler cannot do this automatically, as calling __smlaxy forces the SMLAxy instruction to + be used. This function takes as input an iterator of SMLAInstructions and returns an iterator of + SMLAInstructions (possibly of different length). 
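+
+    For example (operand names shortened for illustration), a pair such as
+
+        SMLAInstruction("smlabb", "tensor__a", "kernel__b")
+        SMLAInstruction("smlatt", "tensor__a", "kernel__b")
+
+    shares its operands and is fused into the single instruction
+
+        SMLAInstruction("smlad", "tensor__a", "kernel__b")
+
+    while an "smlabt" + "smlatb" pair with shared operands becomes "smladx".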
+ """ + curr_tuple = next(instruction_tuples, None) + while curr_tuple: + next_tuple = next(instruction_tuples, None) + if next_tuple is None: + yield curr_tuple + break + + if curr_tuple.has_same_operands(next_tuple): + instructions = sorted([curr_tuple.instruction, next_tuple.instruction]) + if instructions == ["smlabb", "smlatt"]: + yield SMLAInstruction("smlad", curr_tuple.tensor_var, curr_tuple.kernel_var) + next_tuple = next(instruction_tuples, None) + elif instructions == ["smlabt", "smlatb"]: + yield SMLAInstruction("smladx", curr_tuple.tensor_var, curr_tuple.kernel_var) + next_tuple = next(instruction_tuples, None) + else: + yield curr_tuple + + else: + yield curr_tuple + curr_tuple = next_tuple + + +def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]: + """Converts an iterator of SMLAInstructions into lines of C code. + + We want the compiler to re-order these with the memory loads, so we generate them as a series of + calls to instruction aliases instead of as a single `asm` block. + """ + + for smla_instruction in instruction_tuples: + assert "smla" in smla_instruction.instruction + + # We call the instruction using the Arm C Language Extensions. Using ACLE gives better + # cross-compiler compatibility than using __builtin functions. + yield smla_instruction.call_with_acle(f"sum_{index}") + + +def _requantize_sums(num_outputs, requantize_shift, output_zero_point) -> Iterator[str]: + """Generates code to requantize the accumulator values. + + The generated code does not use floating point instructions, as it simulates floating point + multiplication with an a int64 multiply + shift. The bias is added at the beginning, so we can + skip doing it now. The shift is hard-coded, as this saves a few cycles without hurting accuracy + in "most" cases. + + It's *possible* we could save one more cycle here by pre-multiplying the bias with the + requantize multiplier, and then doing the bias addition and shift in the same cycle (via ). + However, it's complicated and only saves one cycle. + + It's also worth noting the SSAT16 operation doesn't help us here. The data isn't stored as two + halfwords in a word, and rearrainging it would take at least one cycle. Two SSAT operations is + just as good. + + Calling __ssat directly is a little bit gross, but GCC and Clang are unreliable about compiling + other ways of writing this. Both the multiply + shift and shift + saturation combine to one + instruction each. + """ + + yield "int32_t scale_val = *scale;" + for i in range(num_outputs): + yield f"int32_t requant_{i} = (sum_{i} * (int64_t) scale_val) >> {requantize_shift - 1};" + yield f"requant_{i} = (requant_{i} + 1) >> 1;" + yield f"requant_{i} = __ssat(requant_{i} + {output_zero_point}, 8);" + + +def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]: + """Generates code to write the requantized sums to memory. + + Note - halfword packing here *does* help. It seems + like it wouldn't, as doing two pipelined int16 stores takes two cycles - the same as halfword + packing plus a pipelined int32 store. We still do the int16 stores when there is an output + stride, though. + + However, this lets the compiler re-order instructions to better preserve memory, as it doesn't + like breaking apart the store instructions (as this messes up pipelining). 
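+
+    As a sketch of the generated C (variable names follow the conventions used elsewhere in this
+    file), with num_outputs=2, offset=0, and stride=1 the two requantized halfwords are packed
+    into one word and written with a single int32 store:
+
+        int32_t packed_res_0 = requant_0 + (requant_1 << 16);
+        output[0] = packed_res_0;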
+ """ + + if stride > 1: + for i in range(num_outputs): + yield f"((int16_t*) output)[{i * stride + offset}] = (int16_t) requant_{i};" else: - raise ValueError(f"No tensordot implementation exists for dtype '{in_dtype}'!") + num_packed = (num_outputs - offset) // 2 + for i in range(num_packed): + index = 2 * i + offset + yield f"int32_t packed_res_{i} = requant_{index} + (requant_{index + 1} << 16);" - function_name = _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix) - return textwrap.dedent( - ( - f""" - #include - #include + if offset == 1: + yield "((int16_t*) output)[1] = (int16_t) requant_0;" - #ifdef __cplusplus - extern "C" - #endif - __STATIC_FORCEINLINE int32_t {function_name}( - uint32_t *out, - uint32_t *tensor, - uint32_t *kernel) {{ - - uint32_t sum = 0; - - #pragma GCC unroll {tensor_h} - for (int i = 0; i < {tensor_h}; i++) {{ - #pragma GCC unroll {tensor_w // simd_lanes} - for (int j = 0; j < {tensor_w // simd_lanes}; j++) {{ - uint32_t tensor_batch = *tensor++; - uint32_t kernel_batch = *kernel++; - {inner_loop.strip()} - }} - tensor += {jump // simd_lanes}; - }} - out[0] = sum; + for i in range(num_packed): + yield f"output[{offset + i}] = packed_res_{i};" + + if (offset + num_outputs) % 2 == 1: + yield f"((int16_t*) output)[{num_packed * 2}] = (int16_t) requant_{num_packed * 2};" + + +def tensordot_int16_impl( + num_outputs: int, + dimensions: Tuple[int, int, int], + offsets: Tuple[int, int, int], + x_strides: Tuple[int, int], + requantize_shift: int = 33, + output_zero_point: int = -128, +) -> Tuple[str, str]: + """Generates code to compute a tensor dot product with requantization. + + The generated function takes pointers to the output, tensor, and kernel as input. All pointers + must be word aligned. Only works with `int16` data type. The generated code is optimized for the + ARMv7E-M architecture. + + Parameters + ---------- + num_outputs: int + The number of tensordot outputs to compute per function call. Computing more than one at + once makes us much faster by reducing how often overlapping data is loaded. However, setting + this too high causes us to run out of registers and need to store data on the stack. We + should autotune this, but num_outputs=2 is usually OK. + + dimensions: Tuple[int, int, int] + The dimensions of each tensordot operation. dimensions[1] and dimensions[2] are the height + and width of the kernel, respectively. dimensions[0] is the width of the data tensor, which + is usually larger than the kernel. + + offsets: Tuple[int, int, int] + Each value is 0 or 1, and represents how far after the given data, kernel, and output + pointers (respectively) we should start reading/writing. This prevents us from having to + check if each pointer is aligned or unaligned at runtime, making us faster. + + x_strides: Tuple[int, int] + The distance (in halfwords) between the start of each input tensor, and where to write each + output result respectively. Only used when num_outputs > 1. + + requantize_shift: int + The distance to right shift after multiplying by the requantization scale. Defaults to 33, + as this lets us skip a shift operation. + + outout_zero_point: int + The output zero point, which will be subtracted after scale multiplication but before + clipping. Defaults to -128, as most models always use this. + + Returns + ------- + func_name, func_code: Tuple[str, str] + The name and source code of the generated function. 
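+
+    Example
+    -------
+    As an illustrative call (the shapes are hypothetical, not taken from a real model), a 3x3
+    int16 kernel over NHWC data with one input channel, a data width of 48, and eight output
+    channels could use:
+
+        >>> name, code = tensordot_int16_impl(2, (48, 3, 3), (0, 0, 0), (1, 8))
+        >>> name
+        'tensordot_opt_x2_int16_w48_3x3_000_1_8'
+
+    Here `code` holds the C source of the generated function, wrapped in an #include guard so the
+    same variant can be shared by every layer that needs it.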
+ """ + function_name = _get_c_function_name(num_outputs, dimensions, offsets, x_strides) + tensor_w, kernel_h, kernel_w = dimensions + tensor_offset, kernel_offset, output_offset = offsets + assert tensor_offset < 2 and kernel_offset < 2 and output_offset < 2 + in_stride, out_stride = x_strides + + tensor_halfwords = _get_tensor_halfwords(dimensions, tensor_offset, num_outputs, in_stride) + kernel_halfwords = _get_kernel_halfwords(dimensions, kernel_offset) + load_tensor_lines = _load_tensor_vars(tensor_halfwords, tensor_w) + load_kernel_lines = _load_kernel_vars(kernel_halfwords) + + def gen_single_loop_macs(index): + draft_macs_iter = _get_draft_macs( + (kernel_h, kernel_w), tensor_halfwords, kernel_halfwords, index * in_stride + ) + draft_macs_iter = _apply_simd_optimizations(draft_macs_iter) + return _expand_instruction_tuples(draft_macs_iter, index) + + multiply_acc_lines = chain.from_iterable(gen_single_loop_macs(i) for i in range(num_outputs)) + requantize_lines = _requantize_sums( + num_outputs, requantize_shift=requantize_shift, output_zero_point=output_zero_point + ) + write_out_lines = _write_sums_to_memory(num_outputs, output_offset, out_stride) + + def insert_lines(lines): + return ("\n" + " " * 10).join(lines) + + # It's very common for one model to have different layers that use identical tensordot + # functions. To prevent function re-definition errors, we need an #include guard. This is better + # than adding a random suffix, as it saves flash memory. + code = textwrap.dedent( + f""" + #ifndef {function_name.upper()}_EXISTS + #define {function_name.upper()}_EXISTS + #include + __attribute__((always_inline)) static inline int32_t {function_name}( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) {{ + {_init_biased_accumulators(num_outputs)} + + {insert_lines(load_tensor_lines)} + + {insert_lines(load_kernel_lines)} + + {insert_lines(multiply_acc_lines)} + + {insert_lines(requantize_lines)} + + {insert_lines(write_out_lines)} return 0; }} + #endif """ - ) ) + return (function_name, code) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py deleted file mode 100644 index 79564f98edfc..000000000000 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py +++ /dev/null @@ -1,296 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Implementations of several conv2d variations, all tensorized using tensordot and optimized for -Cortex-M DSP. Currently contains a standard conv2d and depthwise conv2d implementation, but could be -extended to add a grouped conv2d operator. 
Due to the way we tensorize, this schedule ONLY works -when the data and kernel layouts are NCHWxc and OIHWxi respectively, where x is the number of -input channels divided by the number of groups.""" - -import random -import string -from typing import Callable, Tuple, Union - -import tvm -from tvm import te -from tvm.tir import indexdiv, indexmod -from tvm.topi.utils import traverse_inline -from tvm.topi.nn.pad import pad - -from .micro_kernel.tensordot import ( - make_intrin_tensordot, - tensordot_impl, -) - - -def _unpack_2d_argument(argument: Union[int, Tuple]) -> Tuple: - if isinstance(argument, int): - return (argument, argument) - assert len(argument) == 2 - return argument - - -def _check_no_dilation(dilation: Union[int, Tuple]) -> None: - """Takes a dilation argument as an integer or tuple, and makes sure both dimensions are 1. - Dilation prevents us from using DSP instructions, so this schedule can't work (aside from the - niche case where dilation_h == stride_h and dilation_w == stride_w, which is rare enough we - probably don't need to support it).""" - - dilation_h, dilation_w = _unpack_2d_argument(dilation) - assert dilation_h == dilation_w == 1 - - -def _unpack_padding(padding: Tuple) -> Tuple: - assert isinstance(padding, tuple) - if len(padding) == 2: - (pad_up, pad_down), (pad_left, pad_right) = padding - else: - pad_up, pad_left, pad_down, pad_right = padding - return pad_up, pad_left, pad_down, pad_right - - -def _pad_if_needed(data: te.tensor.Tensor, layout: str, padding: Tuple) -> te.tensor.Tensor: - """Performs padding on a te.tensor.Tensor object if necessary. If padding = (0, 0, 0, 0), the - input tensor is returned unmodified. We only care about tuples here - "VALID" and "SAME" padding - will be converted by the importer TFLite importer if present.""" - - pad_up, pad_left, pad_down, pad_right = padding - if not any(padding): - return data - - # We want to pad the "H" and "W" columns, and their position depends on the layout - pad_before, pad_after = [0, 0, 0, 0], [0, 0, 0, 0] - pad_before[layout.index("H")] = pad_up - pad_before[layout.index("W")] = pad_left - pad_after[layout.index("H")] = pad_down - pad_after[layout.index("W")] = pad_right - return pad(data, pad_before, pad_after, name="padded_data") - - -def _compute_output_dim( - data_dim: int, kernel_dim: int, pad_before: int, pad_after: int, stride: int -) -> int: - """Computes an output dimension of a convolution, given the data dimension, kernel dimension, - padding, and stride along that axis. Note that when stride > 1, this division will often not - be perfectly even.""" - return (data_dim + pad_before + pad_after - kernel_dim) // stride + 1 - - -def _wrap_te_compute( - shape: Tuple, - fcompute: Callable[[int, int, int, int], tvm.ir.PrimExpr], - desired_out_layout: str, - current_out_layout: str = "NHWC", - **kwargs, -) -> te.tensor.Tensor: - """Wrapper over te.compute that allows the output layout to be easily changed.""" - assert current_out_layout.isalpha() and desired_out_layout.isalpha() - assert sorted(current_out_layout) == sorted(desired_out_layout) - forward_order = (current_out_layout.index(c) for c in desired_out_layout) - reverse_order = (desired_out_layout.index(c) for c in current_out_layout) - - return te.compute( - tuple(shape[i] for i in forward_order), - lambda *args: fcompute(*(args[i] for i in reverse_order)), - **kwargs, - ) - - -def _get_suffix() -> str: - """Returns a random eight-character string to append to C function names. 
Prevents accidental - re-definition of functions if the same operator appears twice in a Relay graph.""" - return "".join(random.choices(string.ascii_uppercase, k=8)) - - -def conv2d_nhwc_ohwi_dsp_compute( - _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype -): - """Standard conv2d schedule that can be tensorized using tensordot.""" - - stride_h, stride_w = _unpack_2d_argument(strides) - pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding) - _check_no_dilation(dilation) - - batch_size, data_h, data_w, in_channels = data.shape - output_channels, kernel_h, kernel_w, _ = kernel.shape - assert kernel.shape[3] == in_channels - - output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h) - output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - kc_i = te.reduce_axis((0, in_channels), name="rc") - - padded_data = _pad_if_needed(data, "NHWC", (pad_up, pad_left, pad_down, pad_right)) - return _wrap_te_compute( - (batch_size, output_h, output_w, output_channels), - lambda n, y, x, c: te.sum( - padded_data[n, y * stride_h + kh_i, x * stride_w + kw_i, kc_i].astype(out_dtype) - * kernel[c, kh_i, kw_i, kc_i].astype(out_dtype), - axis=(kh_i, kw_i, kc_i), - ), - out_layout, - name="conv2d", - tag="conv2d_nhwc_ohwi_dsp", - ) - - -def _make_conv2d_tensorization(padded_data, kernel): - _, _, padded_w, in_channels = padded_data.shape - _, kernel_h, kernel_w, _ = kernel.shape - in_dtype = padded_data.dtype - suffix = _get_suffix() - assert in_dtype == kernel.dtype - - data_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="a", dtype=in_dtype) - kernel_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="b", dtype=in_dtype) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - kc_i = te.reduce_axis((0, in_channels), name="kc_i") - - output_slice = te.compute( - (1,), - lambda k: te.sum( - data_slice[kh_i, kw_i, kc_i].astype("int32") - * kernel_slice[kh_i, kw_i, kc_i].astype("int32"), - axis=[kh_i, kw_i, kc_i], - ), - name="c", - ) - - # TVM has a really strange bug where the outer reduction axis (kh_i) having length 1 causes the - # decl_buffer strides check to fail. height_stride is a dark magic workaround for this. 
- height_stride = in_channels * padded_w if kernel_h > 1 else in_channels - jump = (padded_w - kernel_w) * in_channels - tensordot_params = (in_dtype, kernel_h, jump, kernel_w * in_channels, suffix) - intrin_tensordot = make_intrin_tensordot( - (data_slice, kernel_slice, output_slice), - ([height_stride, in_channels, 1], [kernel_w * in_channels, in_channels, 1]), - tensordot_params, - ) - - tensordot_code = tensordot_impl(*tensordot_params) - return (intrin_tensordot, tensordot_code) - - -def depthwise_conv2d_nchw_oihw_dsp_compute( - _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype -): - """Depthwise conv2d schedule that can be tensorized using tensordot.""" - - stride_h, stride_w = _unpack_2d_argument(strides) - pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding) - _check_no_dilation(dilation) - - batch_size, in_channels, data_h, data_w = data.shape - _, c_mul, kernel_h, kernel_w = kernel.shape - output_channels = in_channels * c_mul - assert kernel.shape[0] == in_channels - - output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h) - output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - - padded_data = _pad_if_needed(data, "NCHW", (pad_up, pad_left, pad_down, pad_right)) - return _wrap_te_compute( - (batch_size, output_h, output_w, output_channels), - lambda n, y, x, c: te.sum( - padded_data[ - n, - indexdiv(c, c_mul), - y * stride_h + kh_i, - x * stride_w + kw_i, - ].astype(out_dtype) - * kernel[indexdiv(c, c_mul), indexmod(c, c_mul), kh_i, kw_i].astype(out_dtype), - axis=(kh_i, kw_i), - ), - out_layout, - name="depthwise_conv2d", - tag="depthwise_conv2d_nchw_oihw_dsp", - ) - - -def _make_depthwise_conv2d_tensorization(padded_data, kernel): - _, _, _, padded_w = padded_data.shape - _, _, kernel_h, kernel_w = kernel.shape - - in_dtype = padded_data.dtype - suffix = _get_suffix() - assert in_dtype == kernel.dtype - - data_slice = te.placeholder((kernel_h, kernel_w), name="a", dtype=in_dtype) - kernel_slice = te.placeholder((kernel_h, kernel_w), name="b", dtype=in_dtype) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - - output_slice = te.compute( - (1,), - lambda k: te.sum( - data_slice[kh_i, kw_i].astype("int32") * kernel_slice[kh_i, kw_i].astype("int32"), - axis=[kh_i, kw_i], - ), - name="c", - ) - - jump = padded_w - kernel_w - tensordot_params = (in_dtype, kernel_h, jump, kernel_w, suffix) - intrin_tensordot = make_intrin_tensordot( - (data_slice, kernel_slice, output_slice), - ([padded_w, 1], [kernel_w, 1]), - tensordot_params, - ) - - tensordot_code = tensordot_impl(*tensordot_params) - return (intrin_tensordot, tensordot_code) - - -def tensordot_conv2ds_schedule(_cfg, outs): - """Schedule function using v7e-m DSP instructions for all the conv2d operators in this file. 
We - use one schedule function for them all, because they are tensorized with the same kernel.""" - - schedule = te.create_schedule([x.op for x in outs]) - - def _callback(operator): - if "conv2d" in operator.tag: - output = operator.output(0) - padded_data = output.op.input_tensors[0] - kernel = output.op.input_tensors[1] - - if operator.tag == "conv2d_nhwc_ohwi_dsp": - b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis - kh_ax, kw_ax, ci_ax = schedule[output].op.reduce_axis - schedule[output].reorder(b_ax, y_ax, x_ax, co_ax, kh_ax, kw_ax, ci_ax) - intrin, code = _make_conv2d_tensorization(padded_data, kernel) - - elif operator.tag == "depthwise_conv2d_nchw_oihw_dsp": - b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis - kh_ax, kw_ax = schedule[output].op.reduce_axis - schedule[output].reorder(b_ax, co_ax, y_ax, x_ax, kh_ax, kw_ax) - intrin, code = _make_depthwise_conv2d_tensorization(padded_data, kernel) - - else: - raise ValueError(f"Cannot tensorize {operator.tag} with tensordot!") - - schedule[output].tensorize(kh_ax, intrin) - schedule[output].pragma(b_ax, "import_c", code) - - traverse_inline(schedule, outs[-1].op, _callback) - return schedule diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py new file mode 100644 index 000000000000..fad64cc09bb8 --- /dev/null +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -0,0 +1,370 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Contains TVMScript implementations of some QNN operators for Arm. + +Currently, the only ops with compute functions are fused regular and depthwise convolutions for +Arm Cortex-M with DSP. +""" + +from typing import Tuple + +import tvm +from tvm import te +from tvm.tir import const +from tvm.script import tir as T +from ..utils import get_const_tuple +from .mprofile.dsp.micro_kernel import tensordot + + +def int_ceil_division(x, y): + return -(x // -y) + + +def _compute_output_dim(data_length, kernel_length, stride): + return int_ceil_division(data_length + 1 - kernel_length, stride) + + +def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False): + """Helper function that chooses the right implementation of micro_kernel.tensordot. + + Takes as input the parameters of the conv2d, and returns a tuple of TWO (function_name, + function_code). The first pair (the aligned one) is for even numbered output channels, and the + second pair (the offset one) is for odd-numbered output channels. This function is used for + regular and depthwise convolutions. + + We need different implementations for even vs odd numbered output channels, because the "start" + of an odd output channel in the data tensor or kernel might or might not be on a word boundary, + and the tensordot code expects all input pointers to be word-aligned. 
+ """ + data, kernel = inputs[0:2] + rq_output_zero_point_const = inputs[10] + assert len(rq_output_zero_point_const.op.body) == 1 + output_zero_point = rq_output_zero_point_const.op.body[0] + + _, stride_w = get_const_tuple(attrs.strides) + + if is_depthwise: + assert attrs.data_layout == "NCHW" + assert attrs.kernel_layout == "IOHW" + _, _, height, width = get_const_tuple(data.shape) + _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) + + dimensions = (width, kernel_h, kernel_w) + in_stride = stride_w + data_per_oc_size = height * width + else: + assert attrs.data_layout == "NHWC" + assert attrs.kernel_layout == "OHWI" + _, height, width, in_channels = get_const_tuple(data.shape) + out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) + + dimensions = (width * in_channels, kernel_h, kernel_w * in_channels) + in_stride = in_channels * stride_w + data_per_oc_size = 0 + + assert attrs.out_layout is not None + if attrs.out_layout == "NHWC": + out_stride = out_channels + elif attrs.out_layout == "NCHW": + out_stride = 1 + else: + raise ValueError(f"Unsupported output layout {attrs.out_layout}!") + + x_strides = (in_stride, out_stride) + aligned_func = tensordot.tensordot_int16_impl( + num_outputs, + dimensions, + (0, 0, 0), + x_strides, + output_zero_point=output_zero_point, + ) + + kernel_per_oc_size = dimensions[1] * dimensions[2] + + offsets = (data_per_oc_size % 2, kernel_per_oc_size % 2, 0) + offset_func = tensordot.tensordot_int16_impl( + num_outputs, + dimensions, + offsets, + x_strides, + output_zero_point=output_zero_point, + ) + + return (aligned_func, offset_func) + + +def _make_tscript_ptr(buffer, offset, length, dtype="int16"): + return T.tvm_access_ptr( + T.type_annotation(dtype=dtype), + buffer.data, + offset, + length, + 1, + dtype="handle", + ) + + +def _make_tscript_call(func_name, *args): + return T.evaluate(T.call_extern(func_name, *args, dtype="int32")) + + +def _make_conv2d_primfunc( + call_dimensions: Tuple, + buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple], + aligned_func: Tuple[str, str], + offset_func: Tuple[str, str], + ptr_gens: Tuple, +): + height, width, out_channels = call_dimensions + data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes + aligned_func_name, aligned_func_code = aligned_func + offset_func_name, offset_func_code = offset_func + output_ptr, data_ptr, kernel_ptr = ptr_gens + + # If the functions are identical, we can skip the second loop + if aligned_func_name == offset_func_name: + aligned_channels = out_channels + offset_channels = tvm.tir.const(0) + c_step = tvm.tir.const(1) + else: + aligned_channels = out_channels // 2 + offset_channels = out_channels // 2 + c_step = tvm.tir.const(2) + + def bias_ptr(bias, c): + return _make_tscript_ptr(bias, c, 1, dtype="int32") + + def scale_ptr(scale, c): + return _make_tscript_ptr(scale, c, 1, dtype="int32") + + @T.prim_func + def biased_quantized_conv2d( + data_handle: T.handle, + kernel_handle: T.handle, + bias_handle: T.handle, + scale_handle: T.handle, + output_handle: T.handle, + ) -> None: + + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + data = T.match_buffer(data_handle, data_shape, dtype="int16") + kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16") + bias = T.match_buffer(bias_handle, bias_shape, dtype="int32") + + # We don't specify a data type for the requantization scale, even though we will read it as + # an int32. 
This is because we must pretend it is a float32, as Relay's requantize op only + # allows floating point scales. + scale = T.match_buffer(scale_handle, scale_shape) + output = T.match_buffer(output_handle, output_shape, dtype="int16") + + # This hack prevents TVM from seeing these variables as "unused". I should be using T.reads + # and T.writes, but they don't work. I think it's an issue with BufferTouchedDomain. + # pylint: disable=unused-variable + output[0, 0, 0, 0] = 0 + __1 = data[0, 0, 0, 0] + __2 = kernel[0, 0, 0, 0] + __3 = bias[0, 0, 0, 0] + __4 = scale[0] + # pylint: enable=unused-variable + + for c_ax, y_ax, x_ax in T.grid(aligned_channels, height, width): + with T.block("conv2d_aligned"): + T.block_attr({"pragma_import_c": aligned_func_code}) + y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + _make_tscript_call( + aligned_func_name, + output_ptr(output, y, x, c * c_step), + data_ptr(data, y, x, c * c_step), + kernel_ptr(kernel, c * c_step), + bias_ptr(bias, c * c_step), + scale_ptr(scale, c * c_step), + ) + + for c_ax, y_ax, x_ax in T.grid(offset_channels, height, width): + with T.block("conv2d_offset"): + T.block_attr({"pragma_import_c": offset_func_code}) + y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + _make_tscript_call( + offset_func_name, + output_ptr(output, y, x, c * c_step + 1), + data_ptr(data, y, x, c * c_step + 1, offset=1), + kernel_ptr(kernel, c * c_step + 1, offset=1), + bias_ptr(bias, c * c_step + 1), + scale_ptr(scale, c * c_step + 1), + ) + + return biased_quantized_conv2d + + +def qnn_conv2d(attrs, inputs, out_type): + """Compute for qnn.conv2d with NHWC layout. + + Note that this is a DIFFERENT layout from the Hexagon variant, because they have special + instructions Cortex-M doesn't have. We expect the kernel to have OHWI layout. We also assume + that padding is not necessary, as it will have been done by another pass. + """ + + # Make a few checks to unpack the function arguments and ensure it was called with the right + # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper. + assert len(inputs) == 11 + data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] + output_layout = attrs.out_layout + assert output_layout == "NHWC" + + _, height, width, in_channels = get_const_tuple(data.shape) + out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) + y_stride, x_stride = get_const_tuple(attrs.strides) + + out_height = _compute_output_dim(height, kernel_h, y_stride) + out_width = _compute_output_dim(width, kernel_w, x_stride) + + # Decide how many sums our function should have running at the same time. Doing + # this lets us do "more work" for each memory load, but doing too many of them causes us to run + # out of registers. Currently this is set to either 1 or 2, but autotuning this value would + # improve performance a lot. Tracked by https://github.com/apache/tvm/issues/13528. + + num_outputs = 2 + + # Next, decide whether whether we need "parity alternation". For example, if we have an + # 8x3x3x3 kernel (8 output channels, height 3, width 3, input channels 3) in the OHWI layout, + # then every output channel kernel slice will be 27 halfwords. This means every other output + # channel will not be word aligned, which will cause slowness/crashes! + + # We solve this problem by handling the "aligned" and "offset" output channels with different + # versions of our tensordot function. 
The "aligned func" assumes that the start positions of the + # output, data, and kernel are given exactly by their pointer. The "offset" version assumes that + # the "true" start of the output is the value in the output pointer, plus an offset of 0 or 1. + # _pick_tensordot_impl decides whether this is the case. If not, we only want to generate one + # function (to save flash), so offset_func is a tuple of empty strings. + + aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, False) + + # Helper functions to make pointers + def output_ptr(buffer, y, x, c): + return _make_tscript_ptr( + buffer, + y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c, + 1, + ) + + # We need to disable pylint's unused argument checker, as the kwarg offset is unused but must + # be present for compatibility. We cannot add an underscore as we normally would, as this makes + # the keyword not match. + + # pylint: disable=unused-argument + def data_ptr(buffer, y, x, c, offset=0): + return _make_tscript_ptr( + buffer, + y * const(y_stride * width * in_channels) + + x * const(x_stride * num_outputs * in_channels), + 1, + ) + + # pylint: enable=unused-argument + + def kernel_ptr(buffer, c, offset=0): + return _make_tscript_ptr( + buffer, + c * const(kernel_h * kernel_w * in_channels) - offset, + 1, + ) + + prim_func = _make_conv2d_primfunc( + (const(out_height), const(out_width // num_outputs), const(out_channels)), + (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), + aligned_func, + offset_func, + (output_ptr, data_ptr, kernel_ptr), + ) + + output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16") + return [output] + + +def schedule_qnn_conv2d(_attrs, _outs, _target): + """Schedule function for qnn.conv2d.""" + return None + + +def qnn_depthwise_conv2d(attrs, inputs, out_type): + """Compute for qnn.depthwise_conv2d with NCHW layout. + + Works basically the same way as regular conv2d - see above. + """ + + assert len(inputs) == 11 + data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] + output_layout = attrs.out_layout + assert output_layout == "NHWC" + + _, _, height, width = get_const_tuple(data.shape) + _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) + _, out_height, out_width, _ = get_const_tuple(out_type.shape) + y_stride, x_stride = get_const_tuple(attrs.strides) + + out_height = _compute_output_dim(height, kernel_h, y_stride) + out_width = _compute_output_dim(width, kernel_w, x_stride) + + num_outputs = 2 + + aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, True) + + # Helper functions for making pointers. 
+ def output_ptr(buffer, y, x, c): + return _make_tscript_ptr( + buffer, + y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c, + 1, + ) + + def data_ptr(buffer, y, x, c, offset=0): + if height * width % 2 == 1: + x_ptr_offset = tvm.tir.const(-1) + else: + x_ptr_offset = tvm.tir.const(0) + + return _make_tscript_ptr( + buffer, + c * const(width * height) + + y * const(y_stride * width) + + x * const(x_stride * num_outputs) + + offset * x_ptr_offset, + 1, + ) + + def kernel_ptr(buffer, c, offset=0): + return _make_tscript_ptr( + buffer, + c * tvm.tir.const(kernel_h * kernel_w) - offset, + 1, + ) + + prim_func = _make_conv2d_primfunc( + (const(out_height), const(out_width // num_outputs), const(out_channels)), + (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), + aligned_func, + offset_func, + (output_ptr, data_ptr, kernel_ptr), + ) + + output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16") + return [output] + + +def schedule_qnn_depthwise_conv2d(_attrs, _outs, _target): + """Schedule function for qnn.depthwise_conv2d.""" + return None diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py new file mode 100644 index 000000000000..00225493db96 --- /dev/null +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Arm Cortex-M specific optimizations for quantized operators.""" + +import numpy as np + +from tvm import nd, relay, target +from ..nn import qnn_requantize_alter_layout, qnn_add_alter_layout + + +@qnn_requantize_alter_layout.register(["arm_cpu"]) +def alter_requantize_layout(attrs, inputs, _tinfos, _out_type): + """Changes a floating point requantize op to use int64 multiply + shift for microTVM. + + Usually, this is done by QNN legalization. However, microTVM wants to manually choose the + integer rounding constants in order to: + (a) Have int32, not int64 constants + (b) Use a constant rounding shift to skip a memory load. + + Ideally, we would pick these constants in the requantize (or fused) schedule. Unfortunately that + is not currently possible, so we pick them with `alter_layout` as a hack. This will only work if + the requantize schedule "plays along" with this hack. + """ + + # Only microTVM Cortex-M boards with DSP use the relevant schedules + current_target = target.Target.current(allow_none=False) + if not (current_target.features.has_dsp and "cortex-m" in current_target.mcpu): + return None + + _, in_scale, _, out_scale, _ = inputs + in_scale_numpy = in_scale.data.numpy().astype("float64") + out_scale_scalar = out_scale.data.numpy().item() + + # Shifting by 33 and rounding means shifting by 32, adding 1, and shifting by 1 again. 
This is + # useful, because shifting a multiplication product by 32 can be done for "free" with SMMUL + scales = ((in_scale_numpy / out_scale_scalar) * 2**33).astype("int32") + + # Requantize ops in Relay do not support int32 scales - if we try to use one, requantize.cc will + # raise an error. As a hacky work-around, we change the scale dtype to float32, without changing + # underlying data. This works, as our compute function knows to interpret the scale as an int32. + + # This is only a work-around - a better long-term solution would be adding a new integer + # requantize op, which takes integer scales, shifts, and rounding behavior. + fake_float_scales = scales.view("float32") + + scale_constant = relay.Constant(nd.array(fake_float_scales)) + return relay.qnn.op.requantize(inputs[0], scale_constant, *inputs[2:], **attrs) + + +def _is_qnn_op_depthwise_conv2d(qnn_conv2d_op): + return relay.op.strategy.generic.is_depthwise_conv2d( + qnn_conv2d_op.args[0].type_annotation.shape, + qnn_conv2d_op.attrs.data_layout, + qnn_conv2d_op.args[1].data.shape, + qnn_conv2d_op.attrs.kernel_layout, + qnn_conv2d_op.attrs.groups, + ) + + +@qnn_add_alter_layout.register(["arm_cpu"]) +def alter_add_layout(_attrs, inputs, _tinfos, _out_type): + """Fuses the zero point for a previous quantized operator with this add operation. + + Currently only supports qnn.conv2d, but qnn.dense support should be added. Note that this + optimization means we must pad tensors with the input zero point, and NOT with zero. + """ + + prev_op, biases = inputs + if not hasattr(prev_op, "op"): + return None + if prev_op.op.name != "qnn.conv2d": + return None + + # We should not perform this alteration if the target has a uint * int SIMD MAC operation (since + # these do (x - (-128)) * y efficiently, and conv_input_zp is usually -128). For now, we + # restrict this optimization to just Cortex-M devices, but it might be helpful on others too. + current_target = target.Target.current(allow_none=False) + if not "cortex-m" in current_target.mcpu: + return None + + conv_input_zp = prev_op.args[2].data.numpy().item() + kernel = prev_op.args[1].data.numpy() + + if _is_qnn_op_depthwise_conv2d(prev_op): + axes_to_sum = "HW" + elif prev_op.attrs.groups == 1: + axes_to_sum = "HWI" + else: + # This alteration does not currently support grouped conv2d + return None + axes_to_sum = tuple(map(prev_op.attrs.kernel_layout.index, axes_to_sum)) + element_sums = np.sum(kernel, axis=axes_to_sum).flatten() + + # The zero point is subtracted from the input elements, so we need a "-" sign here + zp_shifted_sums = element_sums * (-conv_input_zp) + + # We want to make sure new_biases is representable as an int32. It's tempting to just check + # whether arr.dtype == "int32" (since Numpy will automatically increase dtype in some cases) + # but this leads to weird wrapping behavior and doesn't work. We must do it manually. 
+ new_biases = biases.data.numpy().astype("int64") + zp_shifted_sums + if new_biases.min() < -(2**31) or new_biases.max() > 2**31 - 1: + return None + + new_input_zp = relay.Constant(nd.array(np.int32(0))) + new_conv_args = (*prev_op.args[:2], new_input_zp, *prev_op.args[3:]) + new_conv_op = relay.qnn.op.conv2d(*new_conv_args, **prev_op.attrs) + bias_constant = relay.Constant(nd.array(new_biases.astype("int32"))) + return relay.add(new_conv_op, bias_constant) diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py index caed28580037..222f7a7c223e 100644 --- a/python/tvm/topi/nn/qnn.py +++ b/python/tvm/topi/nn/qnn.py @@ -188,3 +188,51 @@ def _dispatch_sim_dequantize(value): return intn_value return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices]) + + +@tvm.target.generic_func +def qnn_requantize_alter_layout(_attrs, _inputs, _tinfos, _out_type): + """Change requantize layout. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. + """ + return None + + +@tvm.target.generic_func +def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type): + """Change add layout. + + Add is not a QNN-specific function, but this generic exists so that bias add operations can be + fused with input zero point add optimizations, which only happens if the previous operator is + quantized. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. 
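+    This generic implementation returns None, which leaves the add op untouched; targets such as
+    arm_cpu register overrides that fold a preceding quantized op's input zero point into the
+    bias constant.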
+ """ + return None diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc index 64a5a02e6e25..2170ba76e060 100644 --- a/src/relay/qnn/op/convolution.cc +++ b/src/relay/qnn/op/convolution.cc @@ -53,8 +53,9 @@ bool QnnConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) || data->dtype == DataType::Int(16)) << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << data->dtype; - ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) - << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype; + ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8) || + weight->dtype == DataType::Int(16)) + << "Expected qnn conv2d type(int8, uint8, int16) for weight but was " << weight->dtype; ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32) || param->out_dtype == DataType::Int(64)) << "Expected qnn conv2d type(int16, int32, int64) for output but was " << param->out_dtype; diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py index a3e48f4424ad..58173a9ea6c3 100644 --- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py +++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py @@ -120,7 +120,7 @@ def expected(): @requires_ethosn @pytest.mark.parametrize( "dtype,shape,constant_shape", - [("int8", (1, 4, 4), (4,)), ("int16", (1, 16, 12, 4), (1, 1, 1, 4))], + [("int8", (1, 4, 4), (4,)), ("int32", (1, 16, 12, 4), (1, 1, 1, 4))], ) def test_unsupported_multiply_to_depthwise(dtype, shape, constant_shape): """Check that unsupported variants of multiply to depthwise are not converted.""" @@ -339,7 +339,7 @@ def visit_call(self, call): @requires_ethosn @pytest.mark.parametrize( - "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int16", (1, 4, 4, 4), (4,))] + "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int32", (1, 4, 4, 4), (4,))] ) def test_unsupported_add_to_depthwise(dtype, lhs_shape, rhs_shape): """Check that unsupported variants of add are not converted.""" diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_conv2d.py index 6cf4bbb8e6ed..1b9c1a5e2e94 100644 --- a/tests/python/relay/strategy/arm_cpu/test_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_conv2d.py @@ -93,28 +93,6 @@ class TestConv2d_NHWC_Spatial_Pack(Conv2dTests): schedule_name = parameter("conv2d_nhwc_spatial_pack.arm_cpu") -class TestConv2d_Tensordot(Conv2dTests): - """This test is for the regular conv2d schedule tensorized using tensordot.""" - - data_shape, kernel_size, num_filter, strides, padding = parameters( - # Disabled because these kernels are not an integral number of words - # ((1, 32, 32, 1), (3, 3), 12, 1, 0), - # ((1, 32, 10, 3), (3, 3), 16, 1, 0), - # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1)), - ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0)), - ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0), - ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1)), - ((4, 16, 16, 16), (5, 5), 8, 2, 0), - ) - dilation = parameter(1) - in_dtype = parameter("int8", "int16", "int32") - - data_layout = parameter("NHWC") - kernel_layout = parameter("OHWI") - out_layout = parameter("NHWC", "NCHW") - schedule_name = parameter("conv2d_nhwc_ohwi_dsp.arm_cpu") - - class TestConv2d_NCHW_Spatial_Pack(Conv2dTests): """This test is for 
conv2d_nchw_spatial_pack.arm_cpu schedule.""" diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py index f45d27bdaee9..95ae105f9166 100644 --- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py @@ -110,36 +110,5 @@ class TestDepthwiseConv2d_NHWC_HWOI_DSP(DepthwiseConv2dTests): schedule_name = parameter("depthwise_conv2d_nhwc_dsp.arm_cpu") -class TestDepthwiseConv2d_Tensordot(DepthwiseConv2dTests): - """This test is for the depthwise_conv2d schedule tensorized using tensordot.""" - - data_shape, kernel_size, num_filter, strides, padding, in_dtype = parameters( - # Currently, our schedule requires kernel_w be divisible by the number of simd lanes given - # its dtype. This means 3x3 and 5x5 kernels do not work on int16 or int8 for now. If you had - # to, you could hack around this by padding the data and kernel. - ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, "int32"), - ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, "int32"), - ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, "int32"), - ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, "int32"), - ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, "int32"), - ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, "int32"), - ((1, 24, 24, 8), (5, 5), 8, (1, 1), 1, "int32"), - ((1, 24, 24, 8), (3, 5), 8, (1, 1), 1, "int32"), - # These "evenly divisible" kernels work on smaller dtypes. - ((1, 48, 48, 8), (3, 2), 8, 1, 0, "int16"), - ((1, 48, 48, 8), (4, 4), 8, 1, 0, "int8"), - ) - dilation = parameter(1) - - data_layout = parameter("NCHW") - kernel_layout = parameter("OIHW") - out_layout = parameter("NHWC", "NCHW") - schedule_name = parameter("depthwise_conv2d_nchw_oihw_dsp.arm_cpu") - - if __name__ == "__main__": main() diff --git a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py index 499d677e8f95..d48c7e138fba 100644 --- a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py @@ -26,7 +26,7 @@ from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER -def _change_ndarray_layout(arr, src_layout, dst_layout): +def change_ndarray_layout(arr, src_layout, dst_layout): """Makes a copy of an ndarray, reshaping it to a new data layout. 
Parameter @@ -96,7 +96,7 @@ def test_conv2d( ref_relay_op = relay.op.nn.conv2d( ref_input_var, - relay.const(_change_ndarray_layout(ref_kernel_data, "HWIO", self.ref_kernel_layout)), + relay.const(change_ndarray_layout(ref_kernel_data, "HWIO", self.ref_kernel_layout)), kernel_size=kernel_size, strides=strides, padding=padding, @@ -113,11 +113,11 @@ def test_conv2d( # Reshape output dictionary to match out_layout assert len(ref_outputs) == 1 output_tensor_name, output_tensor = next(iter(ref_outputs.items())) - ref_outputs[output_tensor_name] = _change_ndarray_layout(output_tensor, "NHWC", out_layout) + ref_outputs[output_tensor_name] = change_ndarray_layout(output_tensor, "NHWC", out_layout) - test_input_data = _change_ndarray_layout(ref_input_data, "NHWC", data_layout) + test_input_data = change_ndarray_layout(ref_input_data, "NHWC", data_layout) test_input_var = relay.var("input", relay.TensorType(test_input_data.shape, in_dtype)) - test_kernel_data = _change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout) + test_kernel_data = change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout) test_relay_op = relay.op.nn.conv2d( test_input_var, diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py new file mode 100644 index 000000000000..573231f9632c --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -0,0 +1,358 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""microTVM cares a lot about the convolution + bias + requantize + fused ReLU use case. There have +been some accuracy issues in the past, so this test steps through a model (MobileNetV1) layer by +layer and ensures there is 1-1 correspondance at each step. This test would run way faster if we ran +the model all at once, but then we wouldn't know which layers had issues. + +Furthermore, this test uses some in-development optimizations for microTVM that aren't part of the +main pipeline. +""" + +import numpy as np +from PIL import Image +import pytest + +import tvm +import tvm.testing +from tvm import meta_schedule, relay +from tvm.testing.aot import AOTTestModel, run_and_check, AOTCompiledTestModel +from tvm.relay.backend import Executor, Runtime +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER +from tvm.contrib.download import download_testdata +from test_generalized_conv2d import change_ndarray_layout + + +# The model is the v0.7 version of the TinyML person detection (aka visual wake words) model. This +# is an RGB 96x96 MobileNet V1 model. 
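+# The URL below is also remapped in tests/scripts/request_hook/request_hook.py, so CI can download
+# the model through the project's download mirror instead of hitting GitHub directly.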
+MODEL_URL = "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite" +SAMPLE_URL = ( + "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg" +) + + +@pytest.fixture(scope="module") +def interpreter(): + """Returns a TFLite interpreter with the MLPerf Tiny visual wakewords model loaded, with an + elephant image run through it, and with all intermediate layer outputs saved.""" + + # Make sure the Tensorflow import is skipped if the test is being skipped. This is needed to + # prevent the "python: i386" tests from failing, as they don't have Tensorflow installed. + import tensorflow as tf # pylint: disable=import-outside-toplevel + + # Download the reference model + rel_model_path = "model_microtvm_mobilenetv1.tflite" + file = download_testdata(MODEL_URL, rel_model_path, overwrite=False) + + # Load it into TensorFlow and allocate memory + interpreter = tf.lite.Interpreter(file, experimental_preserve_all_tensors=True) + interpreter.allocate_tensors() + + # Download an image. The neuron activations are strange if we use random data or ones, + # so downloading an image is useful. + rel_image_path = "image_microtvm_mobilenetv1.jpg" + img_path = download_testdata(SAMPLE_URL, rel_image_path, overwrite=False) + image = Image.open(img_path).resize((96, 96)) + image_data_hwc_uint8 = np.asarray(image) + assert image_data_hwc_uint8.shape == (96, 96, 3) + assert image_data_hwc_uint8.dtype == "uint8" + image_data_nhwc_int8 = (image_data_hwc_uint8 + 128).view("int8").reshape((1, 96, 96, 3)) + + # Load the image into the TFLite interpreter and compute all intermediate tensor values + input_details = interpreter.get_input_details() + interpreter.set_tensor(input_details[0]["index"], image_data_nhwc_int8) + interpreter.invoke() + return interpreter + + +def _get_mobilenet_v1_layer_attributes(layer_num): + """Returns the relevant padding and stride for a given layer in a MobileNetV1 model. It's a huge + headache to read this data from TensorFlow, as it is not user accessible via the interpreter. If + we really wanted to, we would have to parse the .tflite file ourselves. This function is a bit + of a hack, but lets us skip that.""" + + if layer_num == 0: # Regular conv2d + return ((0, 0, 1, 1), (2, 2), False) + if layer_num % 2 == 0: # 1x1 conv2d + return ((0, 0, 0, 0), (1, 1), False) + if layer_num in [3, 7, 11, 23]: # Downsizing depthwise_conv2d layers + return ((0, 0, 1, 1), (2, 2), True) + # Depthwise conv2d + return ((1, 1, 1, 1), (1, 1), True) + + +def _get_relu_activation_prefix(layer_num): + if layer_num == 0: + return "model/activation/Relu;" + return f"model/activation_{layer_num}/Relu;" + + +def _get_main_path_tensor_details(details, tensor_num): + """A "main path" tensor is a fused layer input/output. Gets the tensor details from the tensor + index, where 0 gives the original input tensor, 1 gives the output of the first fused + convolution layer, and so on. 
TFLite names are a little wack, so we get this information by + finding the SECOND tensor (which has the suffix "1") for each ReLU activation (the first tensor + is the bias).""" + + if tensor_num == 0: + return details[0] + prefix = _get_relu_activation_prefix(tensor_num - 1) + detail = next(d for d in details if d["name"].startswith(prefix) and d["name"].endswith("1")) + assert len(detail["shape"]) == 4 + assert detail["dtype"] == np.int8 + return detail + + +def _get_bias_details(details, layer_num): + """Gets the tensor details for the bias tensor for the corresponding convolution layer. The + bias tensors always appear before the main path tensors, so we don't have to check the ending to + make sure we have the right one.""" + prefix = _get_relu_activation_prefix(layer_num) + detail = next(d for d in details if d["name"].startswith(prefix)) + assert len(detail["shape"]) == 1 + assert detail["dtype"] == np.int32 + return detail + + +def _get_kernel_details(details, layer_num): + """Gets the tensor details for the kernel tensor for the corresponding convolution layer. These + have a different naming scheme from the main path and bias tensors, as they are converted before + activation function fusion. Note that regular vs depthwise conv2ds have different prefixes.""" + + if layer_num == 0: + prefix = "model/conv2d/Conv2D" + elif layer_num % 2 == 0: + prefix = f"model/conv2d_{layer_num // 2}/" + else: + prefix = f"model/batch_normalization_{layer_num}/" + + detail = next(d for d in details if d["name"].startswith(prefix)) + assert len(detail["shape"]) == 4 + assert detail["dtype"] == np.int8 + return detail + + +def _get_quant_scale_const(quantization_dict, as_scalar=False): + scales = quantization_dict["scales"] + if as_scalar: + assert len(scales) == 1 + scales = scales[0] + return relay.const(scales, "float32") + + +def _get_quant_zp_const(quantization_dict, as_scalar=False): + zero_points = quantization_dict["zero_points"] + if as_scalar: + assert len(zero_points) == 1 + zero_points = zero_points[0] + return relay.const(zero_points, "int32") + + +def _change_layout(data, old_layout, new_layout, dtype): + return change_ndarray_layout(data, old_layout, new_layout).astype(dtype) + + +def _load_tflite_layer(interpreter, layer): + tensor_details = interpreter.get_tensor_details() + + def lookup(detail): + return interpreter.get_tensor(detail["index"]), detail["quantization_parameters"] + + input_data = lookup(_get_main_path_tensor_details(tensor_details, layer)) + kernel_data = lookup(_get_kernel_details(tensor_details, layer)) + bias_data = lookup(_get_bias_details(tensor_details, layer)) + output_data = lookup(_get_main_path_tensor_details(tensor_details, layer + 1)) + return input_data, kernel_data, bias_data, output_data + + +def _make_relay_partial_func(relay_op, *args, **kwargs): + return lambda op: relay_op(op, *args, **kwargs) + + +def _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams, is_depthwise=False): + dtype, padding, strides, data_layout, kernel_layout, output_layout = hyperparams + kernel_size = kernel.shape[1:3] + if is_depthwise: + channels = groups = kernel.shape[3] + else: + channels = kernel.shape[0] + groups = 1 + + kernel_ndarr = _change_layout(kernel, "OHWI", kernel_layout, dtype) + + return _make_relay_partial_func( + relay.qnn.op.conv2d, + relay.const(kernel_ndarr, dtype), + input_zero_point=_get_quant_zp_const(data_quant, as_scalar=True), + kernel_zero_point=_get_quant_zp_const(kernel_quant), + input_scale=_get_quant_scale_const(data_quant, 
as_scalar=True), + kernel_scale=_get_quant_scale_const(kernel_quant), + kernel_size=kernel_size, + data_layout=data_layout, + kernel_layout="IOHW" if is_depthwise else kernel_layout, + dilation=(1, 1), + strides=strides, + padding=padding, + groups=groups, + channels=channels, + out_dtype="int32", + out_layout=output_layout, + ) + + +def _make_bias_op(bias, output_layout): + requantize_axis = output_layout.index("C") + return _make_relay_partial_func( + relay.op.nn.bias_add, + relay.const(bias, "int32"), + axis=requantize_axis, + ) + + +def _make_requantize_op(bias_quant, output_quant, output_dtype, output_layout): + requantize_axis = output_layout.index("C") + return _make_relay_partial_func( + relay.qnn.op.requantize, + _get_quant_scale_const(bias_quant), + _get_quant_zp_const(bias_quant), + _get_quant_scale_const(output_quant, as_scalar=True), + _get_quant_zp_const(output_quant, as_scalar=True), + axis=requantize_axis, + compute_dtype="int64", + out_dtype=output_dtype, + ) + + +def _make_aot_model(params, hyperparams, layouts, is_depthwise=False): + tensors, quantizations = zip(*params) + data, kernel, bias, output = tensors + data_quant, kernel_quant, bias_quant, output_quant = quantizations + + dtype, padding, _strides = hyperparams + data_layout, _, output_layout = layouts + + if any(padding): + pad_const = int(data_quant["zero_points"][0]) + pad_before = (0, padding[0], padding[1], 0) + pad_after = (0, padding[2], padding[3], 0) + data = np.pad(data, tuple(zip(pad_before, pad_after)), constant_values=pad_const) + data_ndarr = _change_layout(data, "NHWC", data_layout, dtype) + output_ndarr = _change_layout(output, "NHWC", output_layout, dtype) + + input_var = relay.var("input", relay.TensorType(data_ndarr.shape, dtype)) + conv2d = _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams + layouts, is_depthwise) + bias = _make_bias_op(bias, output_layout) + requantize = _make_requantize_op(bias_quant, output_quant, dtype, output_layout) + + relay_mod = requantize(bias(conv2d(input_var))) + relay_func = relay.Function([input_var], relay_mod) + return AOTTestModel( + module=tvm.IRModule.from_expr(relay_func), + inputs={"input": data_ndarr}, + outputs={"output": output_ndarr}, + output_tolerance=1, + ) + + +def _make_target(): + return tvm.target.Target("c -keys=arm_cpu -mcpu=cortex-m7") + + +def _make_executor(): + return Executor( + "aot", + { + "workspace-byte-alignment": 8, + "constant-byte-alignment": 8, + "interface-api": "c", + "unpacked-api": True, + }, + ) + + +@pytest.mark.parametrize("layer", range(23)) +@tvm.testing.requires_corstone300 +def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer): + """Checks microTVM output against TFLite for one MobileNetV1 layer. + + Loads the input, kernel, bias, expected output, and quantization parameters from the specified + layer in a TFLite Interpreter. That information is used to construct a Relay Function with the + same structure. The Function is run using microTVM and AOTTestModel, and we verify microTVM's + output is the same as the TFLite ground truth. + + This function only cross-checks the first 23 layers in MobileNetV1, which are regular and + depthwise 2D convolutions (this function only works for 2D convolutions). We do not test the + average pool, dense, or softmax layers at the end of the model. + + Note that we disable the QNN Legalization pass. This allows TVM to use its QNN compute + definitions, fuse the three operations together, and perform other optimizations. 
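+
+    The compiled module runs on the Corstone-300 FVP through AOT_CORSTONE300_RUNNER, so this test
+    is skipped when that environment is unavailable (see the requires_corstone300 marker).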
+ + Parameters + ---------- + interpreter: tensorflow.lite.python.interpreter.Interpreter + A TensorFlow Lite interpreter for a MobileNetV1 model, where invoke() has already been + called and experimental_preserve_all_tensors=True. Should be passed as a Pytest fixture. + + layer: int + The index of the layer to check against TensorFlow's ground truth values. + """ + dtype = "int16" + + tensor, kernel, bias, output = _load_tflite_layer(interpreter, layer) + + padding, strides, is_depthwise = _get_mobilenet_v1_layer_attributes(layer) + if is_depthwise: + data_layout, kernel_layout, output_layout = "NCHW", "OIHW", "NHWC" + else: + data_layout, kernel_layout, output_layout = "NHWC", "OHWI", "NHWC" + + test_model = _make_aot_model( + (tensor, kernel, bias, output), + (dtype, padding, strides), + (data_layout, kernel_layout, output_layout), + is_depthwise=is_depthwise, + ) + + def schedule_fn(_sch): + return True + + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + "relay.backend.use_meta_schedule": True, + "relay.backend.tir_converter": "allow_extern", + }, + disabled_pass=["qnn.Legalize"], + ), meta_schedule.database.ScheduleFnDatabase(schedule_fn): + executor_factory = tvm.relay.build( + test_model.module, + _make_target(), + executor=_make_executor(), + runtime=Runtime("crt"), + params=test_model.params, + mod_name=test_model.name, + ) + compiled = AOTCompiledTestModel(model=test_model, executor_factory=executor_factory) + + run_and_check( + models=[compiled], + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + workspace_byte_alignment=8, + constant_byte_alignment=8, + ) diff --git a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py new file mode 100644 index 000000000000..46d2797ba394 --- /dev/null +++ b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py @@ -0,0 +1,415 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tests for functions in tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot. + +Contains a few unit tests, followed by integration tests for common use cases. Note that we do not +run the generated code - we just make sure the strings match exactly. + +Note that a *lot* of instruction reordering happens during compilation from C to assembly (by GCC or +Clang). I've verified that this instruction reordering happens correctly for all the functions here. +For more details on why the generated code is the way it is, see `tensordot_int16_impl`.""" + +import textwrap + +from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import ( + _get_tensor_halfwords, + _get_kernel_halfwords, + tensordot_int16_impl, +) + + +def test_get_tensor_halfwords(): + """Tests the _get_tensor_halfwords helper function in tensordot.py. 
+ + This function loads the logical indices of the data that will be stored in memory at the tensor + pointer. See the function docstring for more details. + """ + + # fmt: off + # A simple 3x3 depthwise convolution computing one output and with in_stride = 1. Note that each + # row is padded with None at the end to make the rows word-aligned. + assert _get_tensor_halfwords((48, 3, 3), 0, 1, 1) == [ + (0, 0), (0, 1), (0, 2), None, + (1, 0), (1, 1), (1, 2), None, + (2, 0), (2, 1), (2, 2), None + ] + + # If the tensor width is odd, padding alternates before/after every row. + assert _get_tensor_halfwords((49, 3, 3), 0, 1, 1) == [ + (0, 0), (0, 1), (0, 2), None, + None, (1, 0), (1, 1), (1, 2), + (2, 0), (2, 1), (2, 2), None + ] + + # If we are computing multiple outputs, more tensor data becomes relevant. + assert _get_tensor_halfwords((48, 3, 3), 0, 2, 1) == [ + (0, 0), (0, 1), (0, 2), (0, 3), + (1, 0), (1, 1), (1, 2), (1, 3), + (2, 0), (2, 1), (2, 2), (2, 3) + ] + + # If offset=1, relevant data starts one halfword after the kernel pointer. + assert _get_tensor_halfwords((48, 3, 3), 1, 1, 1) == [ + None, (0, 0), (0, 1), (0, 2), + None, (1, 0), (1, 1), (1, 2), + None, (2, 0), (2, 1), (2, 2) + ] + + # These adjustments can be (and often are) used together. + assert _get_tensor_halfwords((49, 3, 3), 1, 2, 2) == [ + None, (0, 0), (0, 1), (0, 2), (0, 3), (0, 4), + (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), None, + None, (2, 0), (2, 1), (2, 2), (2, 3), (2, 4) + ] + # fmt: on + + +def test_get_kernel_halfwords(): + """Tests the _get_kernel_halfwords helper function in tensordot.py. + + This function loads the logical indices of the data that will be stored in memory at the kernel + pointer. See the function docstring for more details. + """ + + # fmt: off + # Example of a kernel for a 3x3 depthwise convolution channel + assert _get_kernel_halfwords((96, 3, 3), 0) == [ + (0, 0), (0, 1), (0, 2), + (1, 0), (1, 1), (1, 2), + (2, 0), (2, 1), (2, 2), + None, + ] + + # Example of a kernel for a 1x1 regular convolution with 4 channels + assert _get_kernel_halfwords((48, 1, 4), 1) == [ + None, (0, 0), (0, 1), (0, 2), (0, 3), None, + ] + # fmt: on + + +def test_write_3x3_depthwise_code(): + """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME" + padding. We are only computing one sum at once, so we don't need stride or output. Note that + this is pretty inefficient - it would be much better to compute a few sums concurrently. 
+ + When inlined, this code compiles (with armv7-a clang 11) into: + + tensordot_opt_x1_int16_w48_3x3_000(int*, int*, int*, int*, int*): + ldr.w lr, [r3] + ldrd r11, r4, [r1] + ldrd r5, r9, [r1, #96] + ldrd r10, r8, [r1, #192] + ldm.w r2, {r1, r6, r7} + ldr.w r12, [sp, #36] + smlad r1, r11, r1, lr + smlabb r1, r4, r6, r1 + smlatb r1, r6, r5, r1 + ldrd r3, r2, [r2, #12] + smlatb r1, r5, r7, r1 + smlatb r1, r7, r9, r1 + smlad r1, r10, r3, r1 + ldr.w r3, [r12] + smlabb r1, r8, r2, r1 + smmul r1, r3, r1 + ssat r1, #8, r1, asr #8 + strh r1, [r0] + """ + _, code = tensordot_int16_impl(1, (48, 3, 3), (0, 0, 0), (1, 1)) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS + #define TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w48_3x3_000( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias; + + int32_t tensor__y00_x00__y00_x01 = tensor[0]; + int32_t tensor__y00_x02__unknown = tensor[1]; + int32_t tensor__y01_x00__y01_x01 = tensor[24]; + int32_t tensor__y01_x02__unknown = tensor[25]; + int32_t tensor__y02_x00__y02_x01 = tensor[48]; + int32_t tensor__y02_x02__unknown = tensor[49]; + + int32_t kernel__y00_x00__y00_x01 = kernel[0]; + int32_t kernel__y00_x02__y01_x00 = kernel[1]; + int32_t kernel__y01_x01__y01_x02 = kernel[2]; + int32_t kernel__y02_x00__y02_x01 = kernel[3]; + int32_t kernel__y02_x02__unknown = kernel[4]; + + sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0); + sum_0 = __smlabb(tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlabt(tensor__y01_x00__y01_x01, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlatb(tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0); + sum_0 = __smlabt(tensor__y01_x02__unknown, kernel__y01_x01__y01_x02, sum_0); + sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0); + sum_0 = __smlabb(tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + -128, 8); + + ((int16_t*) output)[0] = (int16_t) requant_0; + return 0; + } + #endif + """ + ) + + +def test_odd_width_3x3_depthwise_strides_code(): + """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME" + padding and (2, 2) strides, being written into NHWC layout. The layout change is encoded by + out_stride = 4. This is a common use case seen in MobileNetV1, among others. 
+ + Note that despite the rows not being word-aligned, the *tensor pointer will always be word + aligned (satisfying this requirement) since y_stride = 2.""" + + _, code = tensordot_int16_impl(2, (49, 3, 3), (0, 0, 0), (2, 4)) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS + #define TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x2_int16_w49_3x3_000_2_4( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias, sum_1 = *bias; + + int32_t tensor__y00_x00__y00_x01 = tensor[0]; + int32_t tensor__y00_x02__y00_x03 = tensor[1]; + int32_t tensor__y00_x04__unknown = tensor[2]; + int32_t tensor__unknown__y01_x00 = tensor[24]; + int32_t tensor__y01_x01__y01_x02 = tensor[25]; + int32_t tensor__y01_x03__y01_x04 = tensor[26]; + int32_t tensor__y02_x00__y02_x01 = tensor[49]; + int32_t tensor__y02_x02__y02_x03 = tensor[50]; + int32_t tensor__y02_x04__unknown = tensor[51]; + + int32_t kernel__y00_x00__y00_x01 = kernel[0]; + int32_t kernel__y00_x02__y01_x00 = kernel[1]; + int32_t kernel__y01_x01__y01_x02 = kernel[2]; + int32_t kernel__y02_x00__y02_x01 = kernel[3]; + int32_t kernel__y02_x02__unknown = kernel[4]; + + sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0); + sum_0 = __smlabb(tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlatt(tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlad(tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0); + sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0); + sum_0 = __smlabb(tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0); + sum_1 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1); + sum_1 = __smlabb(tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1); + sum_1 = __smlatt(tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1); + sum_1 = __smlad(tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1); + sum_1 = __smlad(tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1); + sum_1 = __smlabb(tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + -128, 8); + int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32; + requant_1 = (requant_1 + 1) >> 1; + requant_1 = __ssat(requant_1 + -128, 8); + + ((int16_t*) output)[0] = (int16_t) requant_0; + ((int16_t*) output)[4] = (int16_t) requant_1; + return 0; + } + #endif + """ + ) + + +def test_1x1x8_convolution_code(): + """This is the function that would be generated for a 1x48x48x8 NHWC input tensor under + standard convolution with a 1x1 kernel. This is a common use case seen in MobileNetV1, + among others. 
In this scenario, a very high amount of memory re-use means that summing + four channels at once makes us faster.""" + + _, code = tensordot_int16_impl(4, (48 * 8, 1, 8), (0, 0, 0), (8, 1)) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS + #define TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x4_int16_w384_1x8_000_8_1( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias, sum_1 = *bias, sum_2 = *bias, sum_3 = *bias; + + int32_t tensor__y00_x00__y00_x01 = tensor[0]; + int32_t tensor__y00_x02__y00_x03 = tensor[1]; + int32_t tensor__y00_x04__y00_x05 = tensor[2]; + int32_t tensor__y00_x06__y00_x07 = tensor[3]; + int32_t tensor__y00_x08__y00_x09 = tensor[4]; + int32_t tensor__y00_x0a__y00_x0b = tensor[5]; + int32_t tensor__y00_x0c__y00_x0d = tensor[6]; + int32_t tensor__y00_x0e__y00_x0f = tensor[7]; + int32_t tensor__y00_x10__y00_x11 = tensor[8]; + int32_t tensor__y00_x12__y00_x13 = tensor[9]; + int32_t tensor__y00_x14__y00_x15 = tensor[10]; + int32_t tensor__y00_x16__y00_x17 = tensor[11]; + int32_t tensor__y00_x18__y00_x19 = tensor[12]; + int32_t tensor__y00_x1a__y00_x1b = tensor[13]; + int32_t tensor__y00_x1c__y00_x1d = tensor[14]; + int32_t tensor__y00_x1e__y00_x1f = tensor[15]; + + int32_t kernel__y00_x00__y00_x01 = kernel[0]; + int32_t kernel__y00_x02__y00_x03 = kernel[1]; + int32_t kernel__y00_x04__y00_x05 = kernel[2]; + int32_t kernel__y00_x06__y00_x07 = kernel[3]; + + sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0); + sum_0 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0); + sum_0 = __smlad(tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0); + sum_0 = __smlad(tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0); + sum_1 = __smlad(tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1); + sum_1 = __smlad(tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1); + sum_1 = __smlad(tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1); + sum_1 = __smlad(tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1); + sum_2 = __smlad(tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2); + sum_2 = __smlad(tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2); + sum_2 = __smlad(tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2); + sum_2 = __smlad(tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2); + sum_3 = __smlad(tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3); + sum_3 = __smlad(tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3); + sum_3 = __smlad(tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3); + sum_3 = __smlad(tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + -128, 8); + int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32; + requant_1 = (requant_1 + 1) >> 1; + requant_1 = __ssat(requant_1 + -128, 8); + int32_t requant_2 = (sum_2 * (int64_t) scale_val) >> 32; + requant_2 = (requant_2 + 1) >> 1; + requant_2 = __ssat(requant_2 + -128, 8); + int32_t requant_3 = (sum_3 * (int64_t) scale_val) >> 32; + requant_3 = (requant_3 + 1) >> 1; + requant_3 = __ssat(requant_3 + -128, 8); + + int32_t packed_res_0 = requant_0 + (requant_1 << 16); + int32_t packed_res_1 = requant_2 + (requant_3 << 16); + output[0] = 
packed_res_0; + output[1] = packed_res_1; + return 0; + } + #endif + """ + ) + + +def test_3x3x3_offset_convolution_code(): + """This is the function that would be generated for a 1x96x96x3 NHWC input tensor under + standard convolution with a 3x3x3 kernel - the first layer of MobileNetV1. This is special, as + it means that every other kernel channel will not start on an even numbered halfword. We won't + have this issue for the input tensor, as we will always compute two positions at a time. + + To solve this 'every other' issue, we will need two different version of this function to + alternate between. This alternation will be handled in TIR scheduling. Here, we just test the + version where the kernel is not word aligned. + + Also tests the requantize_shift and output_zero_point keyword args. These might be needed for + some ResNet models (like image classification from MLPerf Tiny). + """ + + _, code = tensordot_int16_impl( + 1, + (96 * 3, 3, 9), + (1, 1, 1), + (3, 1), + requantize_shift=40, + output_zero_point=4, + ) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS + #define TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w288_3x9_111( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias; + + int32_t tensor__unknown__y00_x00 = tensor[0]; + int32_t tensor__y00_x01__y00_x02 = tensor[1]; + int32_t tensor__y00_x03__y00_x04 = tensor[2]; + int32_t tensor__y00_x05__y00_x06 = tensor[3]; + int32_t tensor__y00_x07__y00_x08 = tensor[4]; + int32_t tensor__unknown__y01_x00 = tensor[144]; + int32_t tensor__y01_x01__y01_x02 = tensor[145]; + int32_t tensor__y01_x03__y01_x04 = tensor[146]; + int32_t tensor__y01_x05__y01_x06 = tensor[147]; + int32_t tensor__y01_x07__y01_x08 = tensor[148]; + int32_t tensor__unknown__y02_x00 = tensor[288]; + int32_t tensor__y02_x01__y02_x02 = tensor[289]; + int32_t tensor__y02_x03__y02_x04 = tensor[290]; + int32_t tensor__y02_x05__y02_x06 = tensor[291]; + int32_t tensor__y02_x07__y02_x08 = tensor[292]; + + int32_t kernel__unknown__y00_x00 = kernel[0]; + int32_t kernel__y00_x01__y00_x02 = kernel[1]; + int32_t kernel__y00_x03__y00_x04 = kernel[2]; + int32_t kernel__y00_x05__y00_x06 = kernel[3]; + int32_t kernel__y00_x07__y00_x08 = kernel[4]; + int32_t kernel__y01_x00__y01_x01 = kernel[5]; + int32_t kernel__y01_x02__y01_x03 = kernel[6]; + int32_t kernel__y01_x04__y01_x05 = kernel[7]; + int32_t kernel__y01_x06__y01_x07 = kernel[8]; + int32_t kernel__y01_x08__y02_x00 = kernel[9]; + int32_t kernel__y02_x01__y02_x02 = kernel[10]; + int32_t kernel__y02_x03__y02_x04 = kernel[11]; + int32_t kernel__y02_x05__y02_x06 = kernel[12]; + int32_t kernel__y02_x07__y02_x08 = kernel[13]; + + sum_0 = __smlatt(tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0); + sum_0 = __smlad(tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0); + sum_0 = __smlad(tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0); + sum_0 = __smlad(tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0); + sum_0 = __smlad(tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0); + sum_0 = __smlatb(tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0); + sum_0 = __smlabt(tensor__y01_x01__y01_x02, kernel__y01_x00__y01_x01, sum_0); + sum_0 = __smlatb(tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0); + sum_0 = __smlabt(tensor__y01_x03__y01_x04, kernel__y01_x02__y01_x03, sum_0); + sum_0 = 
__smlatb(tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0); + sum_0 = __smlabt(tensor__y01_x05__y01_x06, kernel__y01_x04__y01_x05, sum_0); + sum_0 = __smlatb(tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0); + sum_0 = __smlabt(tensor__y01_x07__y01_x08, kernel__y01_x06__y01_x07, sum_0); + sum_0 = __smlatb(tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0); + sum_0 = __smlatt(tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0); + sum_0 = __smlad(tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0); + sum_0 = __smlad(tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0); + sum_0 = __smlad(tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0); + sum_0 = __smlad(tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 39; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + 4, 8); + + ((int16_t*) output)[1] = (int16_t) requant_0; + return 0; + } + #endif + """ + ) diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py index ce379b6b2cb3..cb24353539a4 100644 --- a/tests/scripts/request_hook/request_hook.py +++ b/tests/scripts/request_hook/request_hook.py @@ -145,6 +145,7 @@ "https://github.com/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy", "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy", "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite", + "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite": f"{BASE}/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite", "https://github.com/uwsampl/web-data/raw/main/vta/models/synset.txt": f"{BASE}/2022-10-05/synset.txt", "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": f"{BASE}/gcn_cora.torch", "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg",