diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 53aec11e5816..e956c82828c1 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -877,6 +877,23 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts): return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs) +# QNN ops +@reg.register_alter_op_layout("add") +def alter_op_layout_add(attrs, inputs, tinfos, out_type): + """Alter the layout of a add op. + + Useful for fusing the bias constant with an input zero point constant in a previous quantized + op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn. + """ + return topi.nn.qnn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type) + + +@reg.register_alter_op_layout("qnn.requantize") +def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type): + """Alter the layout of a requantization op.""" + return topi.nn.qnn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type) + + # bitpack @reg.register_compute("nn.bitpack") def compute_bitpack(attrs, inputs, out_dtype): diff --git a/python/tvm/relay/qnn/strategy/__init__.py b/python/tvm/relay/qnn/strategy/__init__.py index 05778c3e9f86..d7b669a4fa42 100644 --- a/python/tvm/relay/qnn/strategy/__init__.py +++ b/python/tvm/relay/qnn/strategy/__init__.py @@ -20,4 +20,5 @@ from __future__ import absolute_import as _abs from .generic import * +from . import arm_cpu from . import hexagon diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py new file mode 100644 index 000000000000..f8653817835e --- /dev/null +++ b/python/tvm/relay/qnn/strategy/arm_cpu.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Quantized operator strategy for Arm CPU. + +As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current +schedules only work for fused operators with bias, as this is the most common use case. Only +regular/depthwise conv2d is supported, but qnn_dense will be added eventually.""" + +from tvm import topi, TVMError +from .generic import qnn_conv2d_strategy +from ... import op as _op +from ...op.strategy.generic import is_depthwise_conv2d + + +@qnn_conv2d_strategy.register("arm_cpu") +def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): + """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP. + + When computing convolutions, we want data that will be used to compute the same output values to + be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions. + + For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW + layouts to the best job of keeping "related" data close. 
In contrast, computing one output of a + regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these + are the layouts we support. + """ + + if not (target.features.has_dsp and "cortex-m" in target.mcpu): + raise TVMError( + "Quantized Arm schedules only exist for Cortex-M with DSP! " + "The qnn.Legalize pass should be run for other Arm processors." + ) + + data = inputs[0] + kernel = inputs[1] + data_layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + groups = attrs.groups + strategy = _op.OpStrategy() + + if groups == 1: + if data_layout == "NHWC" and kernel_layout == "OHWI": + strategy.add_implementation( + topi.arm_cpu.qnn_conv2d, + topi.arm_cpu.schedule_qnn_conv2d, + name="qnn_conv2d.arm_cpu", + ) + elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups): + if data_layout == "NCHW" and kernel_layout == "IOHW": + strategy.add_implementation( + topi.arm_cpu.qnn_depthwise_conv2d, + topi.arm_cpu.schedule_qnn_depthwise_conv2d, + name="qnn_depthwise_conv2d.arm_cpu", + ) + else: + raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d") + + return strategy diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index 20f92a8895dd..eba102662bc4 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -23,9 +23,11 @@ from .conv2d_transpose import * from .conv2d_int8 import * from . import conv2d_alter_op +from . import qnn_alter_op from .bitserial_conv2d import * from .bitserial_dense import * from .injective import * from .group_conv2d import * from .pooling import * from .dense import * +from .qnn import * diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py index fc46f4b34f9d..ab489161a8fa 100644 --- a/python/tvm/topi/arm_cpu/conv2d.py +++ b/python/tvm/topi/arm_cpu/conv2d.py @@ -37,10 +37,6 @@ conv2d_nhwc_dsp_compute, conv2d_nhwc_dsp_schedule, ) -from .mprofile.dsp.tensordot_conv2ds import ( - conv2d_nhwc_ohwi_dsp_compute, - tensordot_conv2ds_schedule, -) @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") @@ -522,17 +518,3 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype): def schedule_conv2d_nhwc_dsp(cfg, outs): """Create schedule for conv2d_nhwc_dsp""" return conv2d_nhwc_dsp_schedule(cfg, outs) - - -@autotvm.register_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu") -def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype): - """Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot kernel.""" - return conv2d_nhwc_ohwi_dsp_compute( - cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype - ) - - -@autotvm.register_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu") -def schedule_conv2d_nhwc_ohwi_dsp(cfg, outs): - """Create schedule for conv2d_nhwc_ohwi.""" - return tensordot_conv2ds_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index 9284b9474513..b6c15a30c037 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -31,10 +31,6 @@ depthwise_conv2d_nhwc_dsp_compute, depthwise_conv2d_nhwc_dsp_schedule, ) -from .mprofile.dsp.tensordot_conv2ds import ( - depthwise_conv2d_nchw_oihw_dsp_compute, - tensordot_conv2ds_schedule, -) @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") @@ -722,19 +718,3 @@ def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, 
strides, padding, dilation, out def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs): """Create schedule for conv2d_nhwc_dsp""" return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs) - - -@autotvm.register_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu") -def depthwise_conv2d_nchw_oihw_dsp( - cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype -): - """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the tensordot kernel.""" - return depthwise_conv2d_nchw_oihw_dsp_compute( - cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype - ) - - -@autotvm.register_topi_schedule("depthwise_conv2d_nchw_oihw_dsp.arm_cpu") -def schedule_depthwise_conv2d_nchw_oihw_dsp(cfg, outs): - """Create schedule for depthwise_conv2d_nchw_oihw.""" - return tensordot_conv2ds_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py index 0fdffc06cf4f..1d36e1dd1e9c 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py @@ -14,142 +14,391 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Computes a "jumpy tensordot" operator, which can be used to tensorize many common operators -including regular conv2d, depthwise conv2d, and grouped conv2d provided the data and kernel layouts -are the optimal ones. When groups=1, the optimal data layout is NHWC and kernel layout is OHWI. When -this is a depthwise convolution, the optimal data layout is NCHW and kernel layout is OIHW.""" +"""Generates optimized code to compute a tensor dot product on ARMv7E-M. +This function can be used to tensorize many common operators including regular conv2d, depthwise +conv2d, and grouped conv2d for some data and kernel layouts. When for regular convolution, use data +layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout data layout is NCHW +and kernel layout OIHW. + +The generated code will also work on v8-M chips that have the DSP instructions (unlike v7E-M, they +are optional in v8-M). Note that the generated code does not use the (potentially very useful) MVE +instructions present on some v8-M chips. +""" + +from dataclasses import dataclass +from itertools import chain import textwrap +from typing import Iterator, Optional, Tuple -from tvm import te, tir -from .common import num_simd_lanes_per_word +@dataclass +class SMLAInstruction: + """Class for keeping track of an item in inventory.""" + instruction: str + tensor_var: str + kernel_var: str -def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix): - """Gets the C function name of the tensordot function.""" - return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}" + def call_with_acle(self, accumulator_var: str) -> str: + return ( + f"{accumulator_var} = __{self.instruction}" + f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});" + ) + def has_same_operands(self, other: "SMLAInstruction") -> bool: + return self.tensor_var == other.tensor_var and self.kernel_var == other.kernel_var -def make_intrin_tensordot(slices, strides, tensordot_params): - """Helper function for constructing tensordot intrinsic. 
We can't construct the whole thing here - (as multiple schedules use tensordot and each must build the intrinstic differently) but we can - build part here to simplify the code.""" - # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params - data, kernel, output = slices - data_strides, kernel_strides = strides +def _get_c_function_name(num_outputs, dimensions, offsets, x_strides): + """Generates a C function name for tensordot. - data_buf = tir.decl_buffer( - data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides - ) - kernel_buf = tir.decl_buffer( - kernel.shape, - kernel.dtype, - name="kernel", - offset_factor=1, - strides=kernel_strides, - ) - output_buf = tir.decl_buffer( - output.shape, output.dtype, name="output", offset_factor=1, strides=[1] + We do not need a suffix, as the generated function will have an #include guard. Unlike other + microTVM operators, _get_c_function_name is never called externally. + """ + tensor_w, kernel_h, kernel_w = dimensions + return ( + f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_" + + f"{kernel_h}x{kernel_w}_" + + "".join(map(str, offsets)) + + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "") ) - def intrin_func(ins, outs): - builder = tir.ir_builder.create() - builder.emit( - tir.call_extern( - "int32", - _get_func_name(*tensordot_params), - outs[0].access_ptr("w"), - ins[0].access_ptr("r"), - ins[1].access_ptr("r"), - ) - ) - return builder.get() - return te.decl_tensor_intrin( - output.op, - intrin_func, - binds={data: data_buf, kernel: kernel_buf, output: output_buf}, - ) +def _init_biased_accumulators(num_outputs): + """Generates code to load the bias into the accumulators. + + Addition is commutative, so we could add the bias before, during, or after performing our + multiply-accumulate operations. Where we add the bias does not change the overflow behavior. + + Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY + trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register, + so we'll do it first. + """ + assignments = [f"sum_{x:x} = *bias" for x in range(num_outputs)] + joined_assignments = ", ".join(assignments) + return f"int32_t {joined_assignments};" + + +def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) -> Iterator[Optional[Tuple]]: + """Gets the logical indices of the data that will be stored in memory at the tensor pointer. + + Returns an Iterator of Optional[Tuple], while skipping over word-aligned pairs of unrelated + halfwords. The returned iterator is as short as possible while having even length and containing + all relevant tensor data. Tuples in the returned Iterator represent an (y, x) offset from the + top-left tensor position being used in this convolution. We need to be aware of the None values + so our code is correctly word-aligned. + + One consequence of these requirements - each row in the tensor is broken into word-aligned pairs + of halfwords (which are later combined into full words). See the test cases (located in + tests/python/topi/python/test_topi_conv2d_tensordot_opts.py) for usage examples. 
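+
+    As a small illustrative case (hypothetical shapes, not taken from a real model): with
+    dimensions=(3, 2, 2), offset=0, num_outputs=1, and in_stride=1, the second kernel row starts
+    at halfword 3 of the data tensor, which is not word-aligned, so that row is padded on both
+    sides:
+
+        >>> _get_tensor_halfwords((3, 2, 2), 0, 1, 1)
+        [(0, 0), (0, 1), None, (1, 0), (1, 1), None]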
+ """ + + tensor_w, kernel_h, kernel_w = dimensions + max_x_val = (num_outputs - 1) * in_stride + kernel_w + halfwords = [] + + for y in range(kernel_h): + # If needed, pad so the beginning of the row is word-aligned + if (y * tensor_w + offset) % 2 == 1: + halfwords.append(None) + + for x in range(max_x_val): + halfwords.append((y, x)) + + # If needed, pad so the row length is word aligned + if (y * tensor_w + offset + max_x_val) % 2 == 1: + halfwords.append(None) + return halfwords + + +def _get_kernel_halfwords(dimensions, offset) -> Iterator[Optional[Tuple]]: + """Gets the logical indices of the data that will be stored in memory at the kernel pointer. + Returns an Iterator of Optional[Tuple]. The returned iterator is as short as possible while + having even length and containing all kernel data. Tuples in the returned Iterator represent + an (y, x) position in the kernel, while None values represent other, irrelevant data. We need + to be aware of the None values so our code is correctly word-aligned. -def tensordot_impl(in_dtype: str, tensor_h: int, jump: int, tensor_w: int, suffix: str) -> str: - """Generates C code for taking the dot products of two `tensor_h` * `tensor_w` tensors. Also has - a `jump` argument that advances the pointer of one tensor by that many words after each row. The - `jump` and `tensor_w` values must be word-aligned for the input data type, as non-word-aligned - memory access is slow on the Cortex-M series. Depending on the input datatype, the code may - contain DSP instructions for Arm v7e-m. C code contains DSP instructions for Arm v7e-m. See - the below pseudocode for reference: - - tensordot(out_ptr, dat_ptr, ker_ptr) { - sum = 0; - for (i = 0; i < tensor_h; i++) { - for (j = 0; j < tensor_w; j++) { - sum += (*dat_ptr++) * (*ker_ptr++); - } - dat_ptr += jump; - } - *out_ptr = sum; - } + See test cases in tests/python/topi/python/test_topi_conv2d_tensordot_opts.py for examples. """ + _, kernel_h, kernel_w = dimensions + halfwords = [] - simd_lanes = num_simd_lanes_per_word(in_dtype) - assert tensor_w % simd_lanes == 0 - assert jump % simd_lanes == 0 + # Kernel data starts `offset` places after the pointer value + if offset == 1: + halfwords.append(None) - if in_dtype == "int8": - inner_loop = """ - uint32_t tensor_c20 = __SXTB16(tensor_batch); - uint32_t kernel_c20 = __SXTB16(kernel_batch); - sum = __SMLAD(tensor_c20, kernel_c20, sum); + for y in range(kernel_h): + for x in range(kernel_w): + halfwords.append((y, x)) - uint32_t tensor_c31 = __SXTB16(__ROR(tensor_batch, 8)); - uint32_t kernel_c31 = __SXTB16(__ROR(kernel_batch, 8)); - sum = __SMLAD(tensor_c31, kernel_c31, sum);""" + # Make sure the returned iterator has even length by padding with an "unknown" value. We want + # even length as this corresponds to an integer number of int32 words. 
+ if (kernel_h * kernel_w + offset) % 2 == 1: + halfwords.append(None) + return halfwords - elif in_dtype == "int16": - inner_loop = """ - sum = __SMLAD(tensor_batch, kernel_batch, sum);""" - elif in_dtype == "int32": - inner_loop = """ - // Compiles to a single MAC instruction - sum += tensor_batch * kernel_batch;""" +def _get_int16_alias(position) -> str: + if position is None: + return "unknown" + y, x = position + return f"y{y:0>2x}_x{x:0>2x}" + + +def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + offset = int(not bool(halfwords[0])) + + for i in range(0, len(halfwords), 2): + var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}" + y, x = halfwords[i + 1] or halfwords[i] + tensor_index = (y * tensor_w + x + offset) // 2 + yield f"int32_t tensor__{var_name} = tensor[{tensor_index}];" + + +def _load_kernel_vars(halfwords) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + for i in range(0, len(halfwords), 2): + var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}" + yield f"int32_t kernel__{var_name} = kernel[{i // 2}];" + + +def _get_draft_macs( + kernel_dims, tensor_halfwords, kernel_halfwords, offset +) -> Iterator[SMLAInstruction]: + """Generates unrolled MAC instructions to compute one tensordot sum. + + Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code + much faster. The generated code does not use SIMD instructions - they are added later by + _apply_simd_optimizations. + + We return an iterator of SMLAInstruction named tuples. Returning an iterator lets us do + optimizations by iterator chaining. + """ + + def get_var(y, x, halfwords) -> Tuple[str, str]: + i = halfwords.index((y, x)) + if i % 2 == 0: + return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b" + return f"{_get_int16_alias(halfwords[i - 1])}__{_get_int16_alias((y, x))}", "t" + + kernel_h, kernel_w = kernel_dims + for y in range(kernel_h): + for x in range(kernel_w): + tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords) + kernel_var, kernel_half = get_var(y, x, kernel_halfwords) + instruction = f"smla{tensor_half}{kernel_half}" + yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}") + + +def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]: + """When possible, fuses single MACs into SIMD MAC instructions. + + The compiler cannot do this automatically, as calling __smlaxy forces the SMLAxy instruction to + be used. This function takes as input an iterator of SMLAInstructions and returns an iterator of + SMLAInstructions (possibly of different length). 
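+
+    For example (operand names shortened for illustration), a pair such as
+
+        SMLAInstruction("smlabb", "tensor__a", "kernel__b")
+        SMLAInstruction("smlatt", "tensor__a", "kernel__b")
+
+    shares its operands and is fused into the single instruction
+
+        SMLAInstruction("smlad", "tensor__a", "kernel__b")
+
+    while an "smlabt" + "smlatb" pair with shared operands becomes "smladx".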
+ """ + curr_tuple = next(instruction_tuples, None) + while curr_tuple: + next_tuple = next(instruction_tuples, None) + if next_tuple is None: + yield curr_tuple + break + + if curr_tuple.has_same_operands(next_tuple): + instructions = sorted([curr_tuple.instruction, next_tuple.instruction]) + if instructions == ["smlabb", "smlatt"]: + yield SMLAInstruction("smlad", curr_tuple.tensor_var, curr_tuple.kernel_var) + next_tuple = next(instruction_tuples, None) + elif instructions == ["smlabt", "smlatb"]: + yield SMLAInstruction("smladx", curr_tuple.tensor_var, curr_tuple.kernel_var) + next_tuple = next(instruction_tuples, None) + else: + yield curr_tuple + + else: + yield curr_tuple + curr_tuple = next_tuple + + +def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]: + """Converts an iterator of SMLAInstructions into lines of C code. + + We want the compiler to re-order these with the memory loads, so we generate them as a series of + calls to instruction aliases instead of as a single `asm` block. + """ + + for smla_instruction in instruction_tuples: + assert "smla" in smla_instruction.instruction + + # We call the instruction using the Arm C Language Extensions. Using ACLE gives better + # cross-compiler compatibility than using __builtin functions. + yield smla_instruction.call_with_acle(f"sum_{index}") + + +def _requantize_sums(num_outputs, requantize_shift, output_zero_point) -> Iterator[str]: + """Generates code to requantize the accumulator values. + + The generated code does not use floating point instructions, as it simulates floating point + multiplication with an a int64 multiply + shift. The bias is added at the beginning, so we can + skip doing it now. The shift is hard-coded, as this saves a few cycles without hurting accuracy + in "most" cases. + + It's *possible* we could save one more cycle here by pre-multiplying the bias with the + requantize multiplier, and then doing the bias addition and shift in the same cycle (via ). + However, it's complicated and only saves one cycle. + + It's also worth noting the SSAT16 operation doesn't help us here. The data isn't stored as two + halfwords in a word, and rearrainging it would take at least one cycle. Two SSAT operations is + just as good. + + Calling __ssat directly is a little bit gross, but GCC and Clang are unreliable about compiling + other ways of writing this. Both the multiply + shift and shift + saturation combine to one + instruction each. + """ + + yield "int32_t scale_val = *scale;" + for i in range(num_outputs): + yield f"int32_t requant_{i} = (sum_{i} * (int64_t) scale_val) >> {requantize_shift - 1};" + yield f"requant_{i} = (requant_{i} + 1) >> 1;" + yield f"requant_{i} = __ssat(requant_{i} + {output_zero_point}, 8);" + + +def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]: + """Generates code to write the requantized sums to memory. + + Note - halfword packing here *does* help. It seems + like it wouldn't, as doing two pipelined int16 stores takes two cycles - the same as halfword + packing plus a pipelined int32 store. We still do the int16 stores when there is an output + stride, though. + + However, this lets the compiler re-order instructions to better preserve memory, as it doesn't + like breaking apart the store instructions (as this messes up pipelining). 
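+
+    As a sketch of the generated C (variable names follow the conventions used elsewhere in this
+    file), with num_outputs=2, offset=0, and stride=1 the two requantized halfwords are packed
+    into one word and written with a single int32 store:
+
+        int32_t packed_res_0 = requant_0 + (requant_1 << 16);
+        output[0] = packed_res_0;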
+ """ + + if stride > 1: + for i in range(num_outputs): + yield f"((int16_t*) output)[{i * stride + offset}] = (int16_t) requant_{i};" else: - raise ValueError(f"No tensordot implementation exists for dtype '{in_dtype}'!") + num_packed = (num_outputs - offset) // 2 + for i in range(num_packed): + index = 2 * i + offset + yield f"int32_t packed_res_{i} = requant_{index} + (requant_{index + 1} << 16);" - function_name = _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix) - return textwrap.dedent( - ( - f""" - #include - #include + if offset == 1: + yield "((int16_t*) output)[1] = (int16_t) requant_0;" - #ifdef __cplusplus - extern "C" - #endif - __STATIC_FORCEINLINE int32_t {function_name}( - uint32_t *out, - uint32_t *tensor, - uint32_t *kernel) {{ - - uint32_t sum = 0; - - #pragma GCC unroll {tensor_h} - for (int i = 0; i < {tensor_h}; i++) {{ - #pragma GCC unroll {tensor_w // simd_lanes} - for (int j = 0; j < {tensor_w // simd_lanes}; j++) {{ - uint32_t tensor_batch = *tensor++; - uint32_t kernel_batch = *kernel++; - {inner_loop.strip()} - }} - tensor += {jump // simd_lanes}; - }} - out[0] = sum; + for i in range(num_packed): + yield f"output[{offset + i}] = packed_res_{i};" + + if (offset + num_outputs) % 2 == 1: + yield f"((int16_t*) output)[{num_packed * 2}] = (int16_t) requant_{num_packed * 2};" + + +def tensordot_int16_impl( + num_outputs: int, + dimensions: Tuple[int, int, int], + offsets: Tuple[int, int, int], + x_strides: Tuple[int, int], + requantize_shift: int = 33, + output_zero_point: int = -128, +) -> Tuple[str, str]: + """Generates code to compute a tensor dot product with requantization. + + The generated function takes pointers to the output, tensor, and kernel as input. All pointers + must be word aligned. Only works with `int16` data type. The generated code is optimized for the + ARMv7E-M architecture. + + Parameters + ---------- + num_outputs: int + The number of tensordot outputs to compute per function call. Computing more than one at + once makes us much faster by reducing how often overlapping data is loaded. However, setting + this too high causes us to run out of registers and need to store data on the stack. We + should autotune this, but num_outputs=2 is usually OK. + + dimensions: Tuple[int, int, int] + The dimensions of each tensordot operation. dimensions[1] and dimensions[2] are the height + and width of the kernel, respectively. dimensions[0] is the width of the data tensor, which + is usually larger than the kernel. + + offsets: Tuple[int, int, int] + Each value is 0 or 1, and represents how far after the given data, kernel, and output + pointers (respectively) we should start reading/writing. This prevents us from having to + check if each pointer is aligned or unaligned at runtime, making us faster. + + x_strides: Tuple[int, int] + The distance (in halfwords) between the start of each input tensor, and where to write each + output result respectively. Only used when num_outputs > 1. + + requantize_shift: int + The distance to right shift after multiplying by the requantization scale. Defaults to 33, + as this lets us skip a shift operation. + + outout_zero_point: int + The output zero point, which will be subtracted after scale multiplication but before + clipping. Defaults to -128, as most models always use this. + + Returns + ------- + func_name, func_code: Tuple[str, str] + The name and source code of the generated function. 
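+
+    Example
+    -------
+    As an illustrative call (the shapes are hypothetical, not taken from a real model), a 3x3
+    int16 kernel over NHWC data with one input channel, a data width of 48, and eight output
+    channels could use:
+
+        >>> name, code = tensordot_int16_impl(2, (48, 3, 3), (0, 0, 0), (1, 8))
+        >>> name
+        'tensordot_opt_x2_int16_w48_3x3_000_1_8'
+
+    Here `code` holds the C source of the generated function, wrapped in an #include guard so the
+    same variant can be shared by every layer that needs it.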
+ """ + function_name = _get_c_function_name(num_outputs, dimensions, offsets, x_strides) + tensor_w, kernel_h, kernel_w = dimensions + tensor_offset, kernel_offset, output_offset = offsets + assert tensor_offset < 2 and kernel_offset < 2 and output_offset < 2 + in_stride, out_stride = x_strides + + tensor_halfwords = _get_tensor_halfwords(dimensions, tensor_offset, num_outputs, in_stride) + kernel_halfwords = _get_kernel_halfwords(dimensions, kernel_offset) + load_tensor_lines = _load_tensor_vars(tensor_halfwords, tensor_w) + load_kernel_lines = _load_kernel_vars(kernel_halfwords) + + def gen_single_loop_macs(index): + draft_macs_iter = _get_draft_macs( + (kernel_h, kernel_w), tensor_halfwords, kernel_halfwords, index * in_stride + ) + draft_macs_iter = _apply_simd_optimizations(draft_macs_iter) + return _expand_instruction_tuples(draft_macs_iter, index) + + multiply_acc_lines = chain.from_iterable(gen_single_loop_macs(i) for i in range(num_outputs)) + requantize_lines = _requantize_sums( + num_outputs, requantize_shift=requantize_shift, output_zero_point=output_zero_point + ) + write_out_lines = _write_sums_to_memory(num_outputs, output_offset, out_stride) + + def insert_lines(lines): + return ("\n" + " " * 10).join(lines) + + # It's very common for one model to have different layers that use identical tensordot + # functions. To prevent function re-definition errors, we need an #include guard. This is better + # than adding a random suffix, as it saves flash memory. + code = textwrap.dedent( + f""" + #ifndef {function_name.upper()}_EXISTS + #define {function_name.upper()}_EXISTS + #include + __attribute__((always_inline)) static inline int32_t {function_name}( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) {{ + {_init_biased_accumulators(num_outputs)} + + {insert_lines(load_tensor_lines)} + + {insert_lines(load_kernel_lines)} + + {insert_lines(multiply_acc_lines)} + + {insert_lines(requantize_lines)} + + {insert_lines(write_out_lines)} return 0; }} + #endif """ - ) ) + return (function_name, code) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py deleted file mode 100644 index 79564f98edfc..000000000000 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py +++ /dev/null @@ -1,296 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Implementations of several conv2d variations, all tensorized using tensordot and optimized for -Cortex-M DSP. Currently contains a standard conv2d and depthwise conv2d implementation, but could be -extended to add a grouped conv2d operator. 
Due to the way we tensorize, this schedule ONLY works -when the data and kernel layouts are NCHWxc and OIHWxi respectively, where x is the number of -input channels divided by the number of groups.""" - -import random -import string -from typing import Callable, Tuple, Union - -import tvm -from tvm import te -from tvm.tir import indexdiv, indexmod -from tvm.topi.utils import traverse_inline -from tvm.topi.nn.pad import pad - -from .micro_kernel.tensordot import ( - make_intrin_tensordot, - tensordot_impl, -) - - -def _unpack_2d_argument(argument: Union[int, Tuple]) -> Tuple: - if isinstance(argument, int): - return (argument, argument) - assert len(argument) == 2 - return argument - - -def _check_no_dilation(dilation: Union[int, Tuple]) -> None: - """Takes a dilation argument as an integer or tuple, and makes sure both dimensions are 1. - Dilation prevents us from using DSP instructions, so this schedule can't work (aside from the - niche case where dilation_h == stride_h and dilation_w == stride_w, which is rare enough we - probably don't need to support it).""" - - dilation_h, dilation_w = _unpack_2d_argument(dilation) - assert dilation_h == dilation_w == 1 - - -def _unpack_padding(padding: Tuple) -> Tuple: - assert isinstance(padding, tuple) - if len(padding) == 2: - (pad_up, pad_down), (pad_left, pad_right) = padding - else: - pad_up, pad_left, pad_down, pad_right = padding - return pad_up, pad_left, pad_down, pad_right - - -def _pad_if_needed(data: te.tensor.Tensor, layout: str, padding: Tuple) -> te.tensor.Tensor: - """Performs padding on a te.tensor.Tensor object if necessary. If padding = (0, 0, 0, 0), the - input tensor is returned unmodified. We only care about tuples here - "VALID" and "SAME" padding - will be converted by the importer TFLite importer if present.""" - - pad_up, pad_left, pad_down, pad_right = padding - if not any(padding): - return data - - # We want to pad the "H" and "W" columns, and their position depends on the layout - pad_before, pad_after = [0, 0, 0, 0], [0, 0, 0, 0] - pad_before[layout.index("H")] = pad_up - pad_before[layout.index("W")] = pad_left - pad_after[layout.index("H")] = pad_down - pad_after[layout.index("W")] = pad_right - return pad(data, pad_before, pad_after, name="padded_data") - - -def _compute_output_dim( - data_dim: int, kernel_dim: int, pad_before: int, pad_after: int, stride: int -) -> int: - """Computes an output dimension of a convolution, given the data dimension, kernel dimension, - padding, and stride along that axis. Note that when stride > 1, this division will often not - be perfectly even.""" - return (data_dim + pad_before + pad_after - kernel_dim) // stride + 1 - - -def _wrap_te_compute( - shape: Tuple, - fcompute: Callable[[int, int, int, int], tvm.ir.PrimExpr], - desired_out_layout: str, - current_out_layout: str = "NHWC", - **kwargs, -) -> te.tensor.Tensor: - """Wrapper over te.compute that allows the output layout to be easily changed.""" - assert current_out_layout.isalpha() and desired_out_layout.isalpha() - assert sorted(current_out_layout) == sorted(desired_out_layout) - forward_order = (current_out_layout.index(c) for c in desired_out_layout) - reverse_order = (desired_out_layout.index(c) for c in current_out_layout) - - return te.compute( - tuple(shape[i] for i in forward_order), - lambda *args: fcompute(*(args[i] for i in reverse_order)), - **kwargs, - ) - - -def _get_suffix() -> str: - """Returns a random eight-character string to append to C function names. 
Prevents accidental - re-definition of functions if the same operator appears twice in a Relay graph.""" - return "".join(random.choices(string.ascii_uppercase, k=8)) - - -def conv2d_nhwc_ohwi_dsp_compute( - _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype -): - """Standard conv2d schedule that can be tensorized using tensordot.""" - - stride_h, stride_w = _unpack_2d_argument(strides) - pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding) - _check_no_dilation(dilation) - - batch_size, data_h, data_w, in_channels = data.shape - output_channels, kernel_h, kernel_w, _ = kernel.shape - assert kernel.shape[3] == in_channels - - output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h) - output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - kc_i = te.reduce_axis((0, in_channels), name="rc") - - padded_data = _pad_if_needed(data, "NHWC", (pad_up, pad_left, pad_down, pad_right)) - return _wrap_te_compute( - (batch_size, output_h, output_w, output_channels), - lambda n, y, x, c: te.sum( - padded_data[n, y * stride_h + kh_i, x * stride_w + kw_i, kc_i].astype(out_dtype) - * kernel[c, kh_i, kw_i, kc_i].astype(out_dtype), - axis=(kh_i, kw_i, kc_i), - ), - out_layout, - name="conv2d", - tag="conv2d_nhwc_ohwi_dsp", - ) - - -def _make_conv2d_tensorization(padded_data, kernel): - _, _, padded_w, in_channels = padded_data.shape - _, kernel_h, kernel_w, _ = kernel.shape - in_dtype = padded_data.dtype - suffix = _get_suffix() - assert in_dtype == kernel.dtype - - data_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="a", dtype=in_dtype) - kernel_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="b", dtype=in_dtype) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - kc_i = te.reduce_axis((0, in_channels), name="kc_i") - - output_slice = te.compute( - (1,), - lambda k: te.sum( - data_slice[kh_i, kw_i, kc_i].astype("int32") - * kernel_slice[kh_i, kw_i, kc_i].astype("int32"), - axis=[kh_i, kw_i, kc_i], - ), - name="c", - ) - - # TVM has a really strange bug where the outer reduction axis (kh_i) having length 1 causes the - # decl_buffer strides check to fail. height_stride is a dark magic workaround for this. 
- height_stride = in_channels * padded_w if kernel_h > 1 else in_channels - jump = (padded_w - kernel_w) * in_channels - tensordot_params = (in_dtype, kernel_h, jump, kernel_w * in_channels, suffix) - intrin_tensordot = make_intrin_tensordot( - (data_slice, kernel_slice, output_slice), - ([height_stride, in_channels, 1], [kernel_w * in_channels, in_channels, 1]), - tensordot_params, - ) - - tensordot_code = tensordot_impl(*tensordot_params) - return (intrin_tensordot, tensordot_code) - - -def depthwise_conv2d_nchw_oihw_dsp_compute( - _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype -): - """Depthwise conv2d schedule that can be tensorized using tensordot.""" - - stride_h, stride_w = _unpack_2d_argument(strides) - pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding) - _check_no_dilation(dilation) - - batch_size, in_channels, data_h, data_w = data.shape - _, c_mul, kernel_h, kernel_w = kernel.shape - output_channels = in_channels * c_mul - assert kernel.shape[0] == in_channels - - output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h) - output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - - padded_data = _pad_if_needed(data, "NCHW", (pad_up, pad_left, pad_down, pad_right)) - return _wrap_te_compute( - (batch_size, output_h, output_w, output_channels), - lambda n, y, x, c: te.sum( - padded_data[ - n, - indexdiv(c, c_mul), - y * stride_h + kh_i, - x * stride_w + kw_i, - ].astype(out_dtype) - * kernel[indexdiv(c, c_mul), indexmod(c, c_mul), kh_i, kw_i].astype(out_dtype), - axis=(kh_i, kw_i), - ), - out_layout, - name="depthwise_conv2d", - tag="depthwise_conv2d_nchw_oihw_dsp", - ) - - -def _make_depthwise_conv2d_tensorization(padded_data, kernel): - _, _, _, padded_w = padded_data.shape - _, _, kernel_h, kernel_w = kernel.shape - - in_dtype = padded_data.dtype - suffix = _get_suffix() - assert in_dtype == kernel.dtype - - data_slice = te.placeholder((kernel_h, kernel_w), name="a", dtype=in_dtype) - kernel_slice = te.placeholder((kernel_h, kernel_w), name="b", dtype=in_dtype) - - kh_i = te.reduce_axis((0, kernel_h), name="kh_i") - kw_i = te.reduce_axis((0, kernel_w), name="kw_i") - - output_slice = te.compute( - (1,), - lambda k: te.sum( - data_slice[kh_i, kw_i].astype("int32") * kernel_slice[kh_i, kw_i].astype("int32"), - axis=[kh_i, kw_i], - ), - name="c", - ) - - jump = padded_w - kernel_w - tensordot_params = (in_dtype, kernel_h, jump, kernel_w, suffix) - intrin_tensordot = make_intrin_tensordot( - (data_slice, kernel_slice, output_slice), - ([padded_w, 1], [kernel_w, 1]), - tensordot_params, - ) - - tensordot_code = tensordot_impl(*tensordot_params) - return (intrin_tensordot, tensordot_code) - - -def tensordot_conv2ds_schedule(_cfg, outs): - """Schedule function using v7e-m DSP instructions for all the conv2d operators in this file. 
We - use one schedule function for them all, because they are tensorized with the same kernel.""" - - schedule = te.create_schedule([x.op for x in outs]) - - def _callback(operator): - if "conv2d" in operator.tag: - output = operator.output(0) - padded_data = output.op.input_tensors[0] - kernel = output.op.input_tensors[1] - - if operator.tag == "conv2d_nhwc_ohwi_dsp": - b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis - kh_ax, kw_ax, ci_ax = schedule[output].op.reduce_axis - schedule[output].reorder(b_ax, y_ax, x_ax, co_ax, kh_ax, kw_ax, ci_ax) - intrin, code = _make_conv2d_tensorization(padded_data, kernel) - - elif operator.tag == "depthwise_conv2d_nchw_oihw_dsp": - b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis - kh_ax, kw_ax = schedule[output].op.reduce_axis - schedule[output].reorder(b_ax, co_ax, y_ax, x_ax, kh_ax, kw_ax) - intrin, code = _make_depthwise_conv2d_tensorization(padded_data, kernel) - - else: - raise ValueError(f"Cannot tensorize {operator.tag} with tensordot!") - - schedule[output].tensorize(kh_ax, intrin) - schedule[output].pragma(b_ax, "import_c", code) - - traverse_inline(schedule, outs[-1].op, _callback) - return schedule diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py new file mode 100644 index 000000000000..fad64cc09bb8 --- /dev/null +++ b/python/tvm/topi/arm_cpu/qnn.py @@ -0,0 +1,370 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Contains TVMScript implementations of some QNN operators for Arm. + +Currently, the only ops with compute functions are fused regular and depthwise convolutions for +Arm Cortex-M with DSP. +""" + +from typing import Tuple + +import tvm +from tvm import te +from tvm.tir import const +from tvm.script import tir as T +from ..utils import get_const_tuple +from .mprofile.dsp.micro_kernel import tensordot + + +def int_ceil_division(x, y): + return -(x // -y) + + +def _compute_output_dim(data_length, kernel_length, stride): + return int_ceil_division(data_length + 1 - kernel_length, stride) + + +def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False): + """Helper function that chooses the right implementation of micro_kernel.tensordot. + + Takes as input the parameters of the conv2d, and returns a tuple of TWO (function_name, + function_code). The first pair (the aligned one) is for even numbered output channels, and the + second pair (the offset one) is for odd-numbered output channels. This function is used for + regular and depthwise convolutions. + + We need different implementations for even vs odd numbered output channels, because the "start" + of an odd output channel in the data tensor or kernel might or might not be on a word boundary, + and the tensordot code expects all input pointers to be word-aligned. 
+ """ + data, kernel = inputs[0:2] + rq_output_zero_point_const = inputs[10] + assert len(rq_output_zero_point_const.op.body) == 1 + output_zero_point = rq_output_zero_point_const.op.body[0] + + _, stride_w = get_const_tuple(attrs.strides) + + if is_depthwise: + assert attrs.data_layout == "NCHW" + assert attrs.kernel_layout == "IOHW" + _, _, height, width = get_const_tuple(data.shape) + _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) + + dimensions = (width, kernel_h, kernel_w) + in_stride = stride_w + data_per_oc_size = height * width + else: + assert attrs.data_layout == "NHWC" + assert attrs.kernel_layout == "OHWI" + _, height, width, in_channels = get_const_tuple(data.shape) + out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) + + dimensions = (width * in_channels, kernel_h, kernel_w * in_channels) + in_stride = in_channels * stride_w + data_per_oc_size = 0 + + assert attrs.out_layout is not None + if attrs.out_layout == "NHWC": + out_stride = out_channels + elif attrs.out_layout == "NCHW": + out_stride = 1 + else: + raise ValueError(f"Unsupported output layout {attrs.out_layout}!") + + x_strides = (in_stride, out_stride) + aligned_func = tensordot.tensordot_int16_impl( + num_outputs, + dimensions, + (0, 0, 0), + x_strides, + output_zero_point=output_zero_point, + ) + + kernel_per_oc_size = dimensions[1] * dimensions[2] + + offsets = (data_per_oc_size % 2, kernel_per_oc_size % 2, 0) + offset_func = tensordot.tensordot_int16_impl( + num_outputs, + dimensions, + offsets, + x_strides, + output_zero_point=output_zero_point, + ) + + return (aligned_func, offset_func) + + +def _make_tscript_ptr(buffer, offset, length, dtype="int16"): + return T.tvm_access_ptr( + T.type_annotation(dtype=dtype), + buffer.data, + offset, + length, + 1, + dtype="handle", + ) + + +def _make_tscript_call(func_name, *args): + return T.evaluate(T.call_extern(func_name, *args, dtype="int32")) + + +def _make_conv2d_primfunc( + call_dimensions: Tuple, + buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple], + aligned_func: Tuple[str, str], + offset_func: Tuple[str, str], + ptr_gens: Tuple, +): + height, width, out_channels = call_dimensions + data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes + aligned_func_name, aligned_func_code = aligned_func + offset_func_name, offset_func_code = offset_func + output_ptr, data_ptr, kernel_ptr = ptr_gens + + # If the functions are identical, we can skip the second loop + if aligned_func_name == offset_func_name: + aligned_channels = out_channels + offset_channels = tvm.tir.const(0) + c_step = tvm.tir.const(1) + else: + aligned_channels = out_channels // 2 + offset_channels = out_channels // 2 + c_step = tvm.tir.const(2) + + def bias_ptr(bias, c): + return _make_tscript_ptr(bias, c, 1, dtype="int32") + + def scale_ptr(scale, c): + return _make_tscript_ptr(scale, c, 1, dtype="int32") + + @T.prim_func + def biased_quantized_conv2d( + data_handle: T.handle, + kernel_handle: T.handle, + bias_handle: T.handle, + scale_handle: T.handle, + output_handle: T.handle, + ) -> None: + + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + data = T.match_buffer(data_handle, data_shape, dtype="int16") + kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16") + bias = T.match_buffer(bias_handle, bias_shape, dtype="int32") + + # We don't specify a data type for the requantization scale, even though we will read it as + # an int32. 
This is because we must pretend it is a float32, as Relay's requantize op only + # allows floating point scales. + scale = T.match_buffer(scale_handle, scale_shape) + output = T.match_buffer(output_handle, output_shape, dtype="int16") + + # This hack prevents TVM from seeing these variables as "unused". I should be using T.reads + # and T.writes, but they don't work. I think it's an issue with BufferTouchedDomain. + # pylint: disable=unused-variable + output[0, 0, 0, 0] = 0 + __1 = data[0, 0, 0, 0] + __2 = kernel[0, 0, 0, 0] + __3 = bias[0, 0, 0, 0] + __4 = scale[0] + # pylint: enable=unused-variable + + for c_ax, y_ax, x_ax in T.grid(aligned_channels, height, width): + with T.block("conv2d_aligned"): + T.block_attr({"pragma_import_c": aligned_func_code}) + y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + _make_tscript_call( + aligned_func_name, + output_ptr(output, y, x, c * c_step), + data_ptr(data, y, x, c * c_step), + kernel_ptr(kernel, c * c_step), + bias_ptr(bias, c * c_step), + scale_ptr(scale, c * c_step), + ) + + for c_ax, y_ax, x_ax in T.grid(offset_channels, height, width): + with T.block("conv2d_offset"): + T.block_attr({"pragma_import_c": offset_func_code}) + y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax]) + _make_tscript_call( + offset_func_name, + output_ptr(output, y, x, c * c_step + 1), + data_ptr(data, y, x, c * c_step + 1, offset=1), + kernel_ptr(kernel, c * c_step + 1, offset=1), + bias_ptr(bias, c * c_step + 1), + scale_ptr(scale, c * c_step + 1), + ) + + return biased_quantized_conv2d + + +def qnn_conv2d(attrs, inputs, out_type): + """Compute for qnn.conv2d with NHWC layout. + + Note that this is a DIFFERENT layout from the Hexagon variant, because they have special + instructions Cortex-M doesn't have. We expect the kernel to have OHWI layout. We also assume + that padding is not necessary, as it will have been done by another pass. + """ + + # Make a few checks to unpack the function arguments and ensure it was called with the right + # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper. + assert len(inputs) == 11 + data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] + output_layout = attrs.out_layout + assert output_layout == "NHWC" + + _, height, width, in_channels = get_const_tuple(data.shape) + out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape) + y_stride, x_stride = get_const_tuple(attrs.strides) + + out_height = _compute_output_dim(height, kernel_h, y_stride) + out_width = _compute_output_dim(width, kernel_w, x_stride) + + # Decide how many sums our function should have running at the same time. Doing + # this lets us do "more work" for each memory load, but doing too many of them causes us to run + # out of registers. Currently this is set to either 1 or 2, but autotuning this value would + # improve performance a lot. Tracked by https://github.com/apache/tvm/issues/13528. + + num_outputs = 2 + + # Next, decide whether whether we need "parity alternation". For example, if we have an + # 8x3x3x3 kernel (8 output channels, height 3, width 3, input channels 3) in the OHWI layout, + # then every output channel kernel slice will be 27 halfwords. This means every other output + # channel will not be word aligned, which will cause slowness/crashes! + + # We solve this problem by handling the "aligned" and "offset" output channels with different + # versions of our tensordot function. 
The "aligned func" assumes that the start positions of the + # output, data, and kernel are given exactly by their pointer. The "offset" version assumes that + # the "true" start of the output is the value in the output pointer, plus an offset of 0 or 1. + # _pick_tensordot_impl decides whether this is the case. If not, we only want to generate one + # function (to save flash), so offset_func is a tuple of empty strings. + + aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, False) + + # Helper functions to make pointers + def output_ptr(buffer, y, x, c): + return _make_tscript_ptr( + buffer, + y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c, + 1, + ) + + # We need to disable pylint's unused argument checker, as the kwarg offset is unused but must + # be present for compatibility. We cannot add an underscore as we normally would, as this makes + # the keyword not match. + + # pylint: disable=unused-argument + def data_ptr(buffer, y, x, c, offset=0): + return _make_tscript_ptr( + buffer, + y * const(y_stride * width * in_channels) + + x * const(x_stride * num_outputs * in_channels), + 1, + ) + + # pylint: enable=unused-argument + + def kernel_ptr(buffer, c, offset=0): + return _make_tscript_ptr( + buffer, + c * const(kernel_h * kernel_w * in_channels) - offset, + 1, + ) + + prim_func = _make_conv2d_primfunc( + (const(out_height), const(out_width // num_outputs), const(out_channels)), + (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), + aligned_func, + offset_func, + (output_ptr, data_ptr, kernel_ptr), + ) + + output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16") + return [output] + + +def schedule_qnn_conv2d(_attrs, _outs, _target): + """Schedule function for qnn.conv2d.""" + return None + + +def qnn_depthwise_conv2d(attrs, inputs, out_type): + """Compute for qnn.depthwise_conv2d with NCHW layout. + + Works basically the same way as regular conv2d - see above. + """ + + assert len(inputs) == 11 + data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8] + output_layout = attrs.out_layout + assert output_layout == "NHWC" + + _, _, height, width = get_const_tuple(data.shape) + _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) + _, out_height, out_width, _ = get_const_tuple(out_type.shape) + y_stride, x_stride = get_const_tuple(attrs.strides) + + out_height = _compute_output_dim(height, kernel_h, y_stride) + out_width = _compute_output_dim(width, kernel_w, x_stride) + + num_outputs = 2 + + aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, num_outputs, True) + + # Helper functions for making pointers. 
+ def output_ptr(buffer, y, x, c): + return _make_tscript_ptr( + buffer, + y * const(out_width * out_channels) + x * const(out_channels * num_outputs) + c, + 1, + ) + + def data_ptr(buffer, y, x, c, offset=0): + if height * width % 2 == 1: + x_ptr_offset = tvm.tir.const(-1) + else: + x_ptr_offset = tvm.tir.const(0) + + return _make_tscript_ptr( + buffer, + c * const(width * height) + + y * const(y_stride * width) + + x * const(x_stride * num_outputs) + + offset * x_ptr_offset, + 1, + ) + + def kernel_ptr(buffer, c, offset=0): + return _make_tscript_ptr( + buffer, + c * tvm.tir.const(kernel_h * kernel_w) - offset, + 1, + ) + + prim_func = _make_conv2d_primfunc( + (const(out_height), const(out_width // num_outputs), const(out_channels)), + (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape), + aligned_func, + offset_func, + (output_ptr, data_ptr, kernel_ptr), + ) + + output = te.extern_primfunc([data, kernel, bias, scale], prim_func, name="tir", dtype="int16") + return [output] + + +def schedule_qnn_depthwise_conv2d(_attrs, _outs, _target): + """Schedule function for qnn.depthwise_conv2d.""" + return None diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py b/python/tvm/topi/arm_cpu/qnn_alter_op.py new file mode 100644 index 000000000000..00225493db96 --- /dev/null +++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Arm Cortex-M specific optimizations for quantized operators.""" + +import numpy as np + +from tvm import nd, relay, target +from ..nn import qnn_requantize_alter_layout, qnn_add_alter_layout + + +@qnn_requantize_alter_layout.register(["arm_cpu"]) +def alter_requantize_layout(attrs, inputs, _tinfos, _out_type): + """Changes a floating point requantize op to use int64 multiply + shift for microTVM. + + Usually, this is done by QNN legalization. However, microTVM wants to manually choose the + integer rounding constants in order to: + (a) Have int32, not int64 constants + (b) Use a constant rounding shift to skip a memory load. + + Ideally, we would pick these constants in the requantize (or fused) schedule. Unfortunately that + is not currently possible, so we pick them with `alter_layout` as a hack. This will only work if + the requantize schedule "plays along" with this hack. + """ + + # Only microTVM Cortex-M boards with DSP use the relevant schedules + current_target = target.Target.current(allow_none=False) + if not (current_target.features.has_dsp and "cortex-m" in current_target.mcpu): + return None + + _, in_scale, _, out_scale, _ = inputs + in_scale_numpy = in_scale.data.numpy().astype("float64") + out_scale_scalar = out_scale.data.numpy().item() + + # Shifting by 33 and rounding means shifting by 32, adding 1, and shifting by 1 again. 
This is + # useful, because shifting a multiplication product by 32 can be done for "free" with SMMUL + scales = ((in_scale_numpy / out_scale_scalar) * 2**33).astype("int32") + + # Requantize ops in Relay do not support int32 scales - if we try to use one, requantize.cc will + # raise an error. As a hacky work-around, we change the scale dtype to float32, without changing + # underlying data. This works, as our compute function knows to interpret the scale as an int32. + + # This is only a work-around - a better long-term solution would be adding a new integer + # requantize op, which takes integer scales, shifts, and rounding behavior. + fake_float_scales = scales.view("float32") + + scale_constant = relay.Constant(nd.array(fake_float_scales)) + return relay.qnn.op.requantize(inputs[0], scale_constant, *inputs[2:], **attrs) + + +def _is_qnn_op_depthwise_conv2d(qnn_conv2d_op): + return relay.op.strategy.generic.is_depthwise_conv2d( + qnn_conv2d_op.args[0].type_annotation.shape, + qnn_conv2d_op.attrs.data_layout, + qnn_conv2d_op.args[1].data.shape, + qnn_conv2d_op.attrs.kernel_layout, + qnn_conv2d_op.attrs.groups, + ) + + +@qnn_add_alter_layout.register(["arm_cpu"]) +def alter_add_layout(_attrs, inputs, _tinfos, _out_type): + """Fuses the zero point for a previous quantized operator with this add operation. + + Currently only supports qnn.conv2d, but qnn.dense support should be added. Note that this + optimization means we must pad tensors with the input zero point, and NOT with zero. + """ + + prev_op, biases = inputs + if not hasattr(prev_op, "op"): + return None + if prev_op.op.name != "qnn.conv2d": + return None + + # We should not perform this alteration if the target has a uint * int SIMD MAC operation (since + # these do (x - (-128)) * y efficiently, and conv_input_zp is usually -128). For now, we + # restrict this optimization to just Cortex-M devices, but it might be helpful on others too. + current_target = target.Target.current(allow_none=False) + if not "cortex-m" in current_target.mcpu: + return None + + conv_input_zp = prev_op.args[2].data.numpy().item() + kernel = prev_op.args[1].data.numpy() + + if _is_qnn_op_depthwise_conv2d(prev_op): + axes_to_sum = "HW" + elif prev_op.attrs.groups == 1: + axes_to_sum = "HWI" + else: + # This alteration does not currently support grouped conv2d + return None + axes_to_sum = tuple(map(prev_op.attrs.kernel_layout.index, axes_to_sum)) + element_sums = np.sum(kernel, axis=axes_to_sum).flatten() + + # The zero point is subtracted from the input elements, so we need a "-" sign here + zp_shifted_sums = element_sums * (-conv_input_zp) + + # We want to make sure new_biases is representable as an int32. It's tempting to just check + # whether arr.dtype == "int32" (since Numpy will automatically increase dtype in some cases) + # but this leads to weird wrapping behavior and doesn't work. We must do it manually. 
+ new_biases = biases.data.numpy().astype("int64") + zp_shifted_sums + if new_biases.min() < -(2**31) or new_biases.max() > 2**31 - 1: + return None + + new_input_zp = relay.Constant(nd.array(np.int32(0))) + new_conv_args = (*prev_op.args[:2], new_input_zp, *prev_op.args[3:]) + new_conv_op = relay.qnn.op.conv2d(*new_conv_args, **prev_op.attrs) + bias_constant = relay.Constant(nd.array(new_biases.astype("int32"))) + return relay.add(new_conv_op, bias_constant) diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py index caed28580037..222f7a7c223e 100644 --- a/python/tvm/topi/nn/qnn.py +++ b/python/tvm/topi/nn/qnn.py @@ -188,3 +188,51 @@ def _dispatch_sim_dequantize(value): return intn_value return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices]) + + +@tvm.target.generic_func +def qnn_requantize_alter_layout(_attrs, _inputs, _tinfos, _out_type): + """Change requantize layout. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. + """ + return None + + +@tvm.target.generic_func +def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type): + """Change add layout. + + Add is not a QNN-specific function, but this generic exists so that bias add operations can be + fused with input zero point add optimizations, which only happens if the previous operator is + quantized. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. 
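+    This generic implementation returns None, which leaves the add op untouched; targets such as
+    arm_cpu register overrides that fold a preceding quantized op's input zero point into the
+    bias constant.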
+ """ + return None diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc index 64a5a02e6e25..2170ba76e060 100644 --- a/src/relay/qnn/op/convolution.cc +++ b/src/relay/qnn/op/convolution.cc @@ -53,8 +53,9 @@ bool QnnConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) || data->dtype == DataType::Int(16)) << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << data->dtype; - ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8)) - << "Expected qnn conv2d type(int8, uint8) for weight but was " << weight->dtype; + ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == DataType::UInt(8) || + weight->dtype == DataType::Int(16)) + << "Expected qnn conv2d type(int8, uint8, int16) for weight but was " << weight->dtype; ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == DataType::Int(32) || param->out_dtype == DataType::Int(64)) << "Expected qnn conv2d type(int16, int32, int64) for output but was " << param->out_dtype; diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py index a3e48f4424ad..58173a9ea6c3 100644 --- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py +++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py @@ -120,7 +120,7 @@ def expected(): @requires_ethosn @pytest.mark.parametrize( "dtype,shape,constant_shape", - [("int8", (1, 4, 4), (4,)), ("int16", (1, 16, 12, 4), (1, 1, 1, 4))], + [("int8", (1, 4, 4), (4,)), ("int32", (1, 16, 12, 4), (1, 1, 1, 4))], ) def test_unsupported_multiply_to_depthwise(dtype, shape, constant_shape): """Check that unsupported variants of multiply to depthwise are not converted.""" @@ -339,7 +339,7 @@ def visit_call(self, call): @requires_ethosn @pytest.mark.parametrize( - "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int16", (1, 4, 4, 4), (4,))] + "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int32", (1, 4, 4, 4), (4,))] ) def test_unsupported_add_to_depthwise(dtype, lhs_shape, rhs_shape): """Check that unsupported variants of add are not converted.""" diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_conv2d.py index 6cf4bbb8e6ed..1b9c1a5e2e94 100644 --- a/tests/python/relay/strategy/arm_cpu/test_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_conv2d.py @@ -93,28 +93,6 @@ class TestConv2d_NHWC_Spatial_Pack(Conv2dTests): schedule_name = parameter("conv2d_nhwc_spatial_pack.arm_cpu") -class TestConv2d_Tensordot(Conv2dTests): - """This test is for the regular conv2d schedule tensorized using tensordot.""" - - data_shape, kernel_size, num_filter, strides, padding = parameters( - # Disabled because these kernels are not an integral number of words - # ((1, 32, 32, 1), (3, 3), 12, 1, 0), - # ((1, 32, 10, 3), (3, 3), 16, 1, 0), - # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1)), - ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0)), - ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0), - ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1)), - ((4, 16, 16, 16), (5, 5), 8, 2, 0), - ) - dilation = parameter(1) - in_dtype = parameter("int8", "int16", "int32") - - data_layout = parameter("NHWC") - kernel_layout = parameter("OHWI") - out_layout = parameter("NHWC", "NCHW") - schedule_name = parameter("conv2d_nhwc_ohwi_dsp.arm_cpu") - - class TestConv2d_NCHW_Spatial_Pack(Conv2dTests): """This test is for 
conv2d_nchw_spatial_pack.arm_cpu schedule.""" diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py index f45d27bdaee9..95ae105f9166 100644 --- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py @@ -110,36 +110,5 @@ class TestDepthwiseConv2d_NHWC_HWOI_DSP(DepthwiseConv2dTests): schedule_name = parameter("depthwise_conv2d_nhwc_dsp.arm_cpu") -class TestDepthwiseConv2d_Tensordot(DepthwiseConv2dTests): - """This test is for the depthwise_conv2d schedule tensorized using tensordot.""" - - data_shape, kernel_size, num_filter, strides, padding, in_dtype = parameters( - # Currently, our schedule requires kernel_w be divisible by the number of simd lanes given - # its dtype. This means 3x3 and 5x5 kernels do not work on int16 or int8 for now. If you had - # to, you could hack around this by padding the data and kernel. - ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, "int32"), - ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, "int32"), - ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, "int32"), - ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, "int32"), - ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), "int32"), - ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, "int32"), - ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, "int32"), - ((1, 24, 24, 8), (5, 5), 8, (1, 1), 1, "int32"), - ((1, 24, 24, 8), (3, 5), 8, (1, 1), 1, "int32"), - # These "evenly divisible" kernels work on smaller dtypes. - ((1, 48, 48, 8), (3, 2), 8, 1, 0, "int16"), - ((1, 48, 48, 8), (4, 4), 8, 1, 0, "int8"), - ) - dilation = parameter(1) - - data_layout = parameter("NCHW") - kernel_layout = parameter("OIHW") - out_layout = parameter("NHWC", "NCHW") - schedule_name = parameter("depthwise_conv2d_nchw_oihw_dsp.arm_cpu") - - if __name__ == "__main__": main() diff --git a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py index 499d677e8f95..d48c7e138fba 100644 --- a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py @@ -26,7 +26,7 @@ from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER -def _change_ndarray_layout(arr, src_layout, dst_layout): +def change_ndarray_layout(arr, src_layout, dst_layout): """Makes a copy of an ndarray, reshaping it to a new data layout. 
Parameter @@ -96,7 +96,7 @@ def test_conv2d( ref_relay_op = relay.op.nn.conv2d( ref_input_var, - relay.const(_change_ndarray_layout(ref_kernel_data, "HWIO", self.ref_kernel_layout)), + relay.const(change_ndarray_layout(ref_kernel_data, "HWIO", self.ref_kernel_layout)), kernel_size=kernel_size, strides=strides, padding=padding, @@ -113,11 +113,11 @@ def test_conv2d( # Reshape output dictionary to match out_layout assert len(ref_outputs) == 1 output_tensor_name, output_tensor = next(iter(ref_outputs.items())) - ref_outputs[output_tensor_name] = _change_ndarray_layout(output_tensor, "NHWC", out_layout) + ref_outputs[output_tensor_name] = change_ndarray_layout(output_tensor, "NHWC", out_layout) - test_input_data = _change_ndarray_layout(ref_input_data, "NHWC", data_layout) + test_input_data = change_ndarray_layout(ref_input_data, "NHWC", data_layout) test_input_var = relay.var("input", relay.TensorType(test_input_data.shape, in_dtype)) - test_kernel_data = _change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout) + test_kernel_data = change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout) test_relay_op = relay.op.nn.conv2d( test_input_var, diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py new file mode 100644 index 000000000000..573231f9632c --- /dev/null +++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py @@ -0,0 +1,358 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""microTVM cares a lot about the convolution + bias + requantize + fused ReLU use case. There have +been some accuracy issues in the past, so this test steps through a model (MobileNetV1) layer by +layer and ensures there is 1-1 correspondance at each step. This test would run way faster if we ran +the model all at once, but then we wouldn't know which layers had issues. + +Furthermore, this test uses some in-development optimizations for microTVM that aren't part of the +main pipeline. +""" + +import numpy as np +from PIL import Image +import pytest + +import tvm +import tvm.testing +from tvm import meta_schedule, relay +from tvm.testing.aot import AOTTestModel, run_and_check, AOTCompiledTestModel +from tvm.relay.backend import Executor, Runtime +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER +from tvm.contrib.download import download_testdata +from test_generalized_conv2d import change_ndarray_layout + + +# The model is the v0.7 version of the TinyML person detection (aka visual wake words) model. This +# is an RGB 96x96 MobileNet V1 model. 
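+# The URL below is also remapped in tests/scripts/request_hook/request_hook.py, so CI can download
+# the model through the project's download mirror instead of hitting GitHub directly.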
+MODEL_URL = "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite" +SAMPLE_URL = ( + "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg" +) + + +@pytest.fixture(scope="module") +def interpreter(): + """Returns a TFLite interpreter with the MLPerf Tiny visual wakewords model loaded, with an + elephant image run through it, and with all intermediate layer outputs saved.""" + + # Make sure the Tensorflow import is skipped if the test is being skipped. This is needed to + # prevent the "python: i386" tests from failing, as they don't have Tensorflow installed. + import tensorflow as tf # pylint: disable=import-outside-toplevel + + # Download the reference model + rel_model_path = "model_microtvm_mobilenetv1.tflite" + file = download_testdata(MODEL_URL, rel_model_path, overwrite=False) + + # Load it into TensorFlow and allocate memory + interpreter = tf.lite.Interpreter(file, experimental_preserve_all_tensors=True) + interpreter.allocate_tensors() + + # Download an image. The neuron activations are strange if we use random data or ones, + # so downloading an image is useful. + rel_image_path = "image_microtvm_mobilenetv1.jpg" + img_path = download_testdata(SAMPLE_URL, rel_image_path, overwrite=False) + image = Image.open(img_path).resize((96, 96)) + image_data_hwc_uint8 = np.asarray(image) + assert image_data_hwc_uint8.shape == (96, 96, 3) + assert image_data_hwc_uint8.dtype == "uint8" + image_data_nhwc_int8 = (image_data_hwc_uint8 + 128).view("int8").reshape((1, 96, 96, 3)) + + # Load the image into the TFLite interpreter and compute all intermediate tensor values + input_details = interpreter.get_input_details() + interpreter.set_tensor(input_details[0]["index"], image_data_nhwc_int8) + interpreter.invoke() + return interpreter + + +def _get_mobilenet_v1_layer_attributes(layer_num): + """Returns the relevant padding and stride for a given layer in a MobileNetV1 model. It's a huge + headache to read this data from TensorFlow, as it is not user accessible via the interpreter. If + we really wanted to, we would have to parse the .tflite file ourselves. This function is a bit + of a hack, but lets us skip that.""" + + if layer_num == 0: # Regular conv2d + return ((0, 0, 1, 1), (2, 2), False) + if layer_num % 2 == 0: # 1x1 conv2d + return ((0, 0, 0, 0), (1, 1), False) + if layer_num in [3, 7, 11, 23]: # Downsizing depthwise_conv2d layers + return ((0, 0, 1, 1), (2, 2), True) + # Depthwise conv2d + return ((1, 1, 1, 1), (1, 1), True) + + +def _get_relu_activation_prefix(layer_num): + if layer_num == 0: + return "model/activation/Relu;" + return f"model/activation_{layer_num}/Relu;" + + +def _get_main_path_tensor_details(details, tensor_num): + """A "main path" tensor is a fused layer input/output. Gets the tensor details from the tensor + index, where 0 gives the original input tensor, 1 gives the output of the first fused + convolution layer, and so on. 
TFLite names are a little wack, so we get this information by + finding the SECOND tensor (which has the suffix "1") for each ReLU activation (the first tensor + is the bias).""" + + if tensor_num == 0: + return details[0] + prefix = _get_relu_activation_prefix(tensor_num - 1) + detail = next(d for d in details if d["name"].startswith(prefix) and d["name"].endswith("1")) + assert len(detail["shape"]) == 4 + assert detail["dtype"] == np.int8 + return detail + + +def _get_bias_details(details, layer_num): + """Gets the tensor details for the bias tensor for the corresponding convolution layer. The + bias tensors always appear before the main path tensors, so we don't have to check the ending to + make sure we have the right one.""" + prefix = _get_relu_activation_prefix(layer_num) + detail = next(d for d in details if d["name"].startswith(prefix)) + assert len(detail["shape"]) == 1 + assert detail["dtype"] == np.int32 + return detail + + +def _get_kernel_details(details, layer_num): + """Gets the tensor details for the kernel tensor for the corresponding convolution layer. These + have a different naming scheme from the main path and bias tensors, as they are converted before + activation function fusion. Note that regular vs depthwise conv2ds have different prefixes.""" + + if layer_num == 0: + prefix = "model/conv2d/Conv2D" + elif layer_num % 2 == 0: + prefix = f"model/conv2d_{layer_num // 2}/" + else: + prefix = f"model/batch_normalization_{layer_num}/" + + detail = next(d for d in details if d["name"].startswith(prefix)) + assert len(detail["shape"]) == 4 + assert detail["dtype"] == np.int8 + return detail + + +def _get_quant_scale_const(quantization_dict, as_scalar=False): + scales = quantization_dict["scales"] + if as_scalar: + assert len(scales) == 1 + scales = scales[0] + return relay.const(scales, "float32") + + +def _get_quant_zp_const(quantization_dict, as_scalar=False): + zero_points = quantization_dict["zero_points"] + if as_scalar: + assert len(zero_points) == 1 + zero_points = zero_points[0] + return relay.const(zero_points, "int32") + + +def _change_layout(data, old_layout, new_layout, dtype): + return change_ndarray_layout(data, old_layout, new_layout).astype(dtype) + + +def _load_tflite_layer(interpreter, layer): + tensor_details = interpreter.get_tensor_details() + + def lookup(detail): + return interpreter.get_tensor(detail["index"]), detail["quantization_parameters"] + + input_data = lookup(_get_main_path_tensor_details(tensor_details, layer)) + kernel_data = lookup(_get_kernel_details(tensor_details, layer)) + bias_data = lookup(_get_bias_details(tensor_details, layer)) + output_data = lookup(_get_main_path_tensor_details(tensor_details, layer + 1)) + return input_data, kernel_data, bias_data, output_data + + +def _make_relay_partial_func(relay_op, *args, **kwargs): + return lambda op: relay_op(op, *args, **kwargs) + + +def _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams, is_depthwise=False): + dtype, padding, strides, data_layout, kernel_layout, output_layout = hyperparams + kernel_size = kernel.shape[1:3] + if is_depthwise: + channels = groups = kernel.shape[3] + else: + channels = kernel.shape[0] + groups = 1 + + kernel_ndarr = _change_layout(kernel, "OHWI", kernel_layout, dtype) + + return _make_relay_partial_func( + relay.qnn.op.conv2d, + relay.const(kernel_ndarr, dtype), + input_zero_point=_get_quant_zp_const(data_quant, as_scalar=True), + kernel_zero_point=_get_quant_zp_const(kernel_quant), + input_scale=_get_quant_scale_const(data_quant, 
as_scalar=True), + kernel_scale=_get_quant_scale_const(kernel_quant), + kernel_size=kernel_size, + data_layout=data_layout, + kernel_layout="IOHW" if is_depthwise else kernel_layout, + dilation=(1, 1), + strides=strides, + padding=padding, + groups=groups, + channels=channels, + out_dtype="int32", + out_layout=output_layout, + ) + + +def _make_bias_op(bias, output_layout): + requantize_axis = output_layout.index("C") + return _make_relay_partial_func( + relay.op.nn.bias_add, + relay.const(bias, "int32"), + axis=requantize_axis, + ) + + +def _make_requantize_op(bias_quant, output_quant, output_dtype, output_layout): + requantize_axis = output_layout.index("C") + return _make_relay_partial_func( + relay.qnn.op.requantize, + _get_quant_scale_const(bias_quant), + _get_quant_zp_const(bias_quant), + _get_quant_scale_const(output_quant, as_scalar=True), + _get_quant_zp_const(output_quant, as_scalar=True), + axis=requantize_axis, + compute_dtype="int64", + out_dtype=output_dtype, + ) + + +def _make_aot_model(params, hyperparams, layouts, is_depthwise=False): + tensors, quantizations = zip(*params) + data, kernel, bias, output = tensors + data_quant, kernel_quant, bias_quant, output_quant = quantizations + + dtype, padding, _strides = hyperparams + data_layout, _, output_layout = layouts + + if any(padding): + pad_const = int(data_quant["zero_points"][0]) + pad_before = (0, padding[0], padding[1], 0) + pad_after = (0, padding[2], padding[3], 0) + data = np.pad(data, tuple(zip(pad_before, pad_after)), constant_values=pad_const) + data_ndarr = _change_layout(data, "NHWC", data_layout, dtype) + output_ndarr = _change_layout(output, "NHWC", output_layout, dtype) + + input_var = relay.var("input", relay.TensorType(data_ndarr.shape, dtype)) + conv2d = _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams + layouts, is_depthwise) + bias = _make_bias_op(bias, output_layout) + requantize = _make_requantize_op(bias_quant, output_quant, dtype, output_layout) + + relay_mod = requantize(bias(conv2d(input_var))) + relay_func = relay.Function([input_var], relay_mod) + return AOTTestModel( + module=tvm.IRModule.from_expr(relay_func), + inputs={"input": data_ndarr}, + outputs={"output": output_ndarr}, + output_tolerance=1, + ) + + +def _make_target(): + return tvm.target.Target("c -keys=arm_cpu -mcpu=cortex-m7") + + +def _make_executor(): + return Executor( + "aot", + { + "workspace-byte-alignment": 8, + "constant-byte-alignment": 8, + "interface-api": "c", + "unpacked-api": True, + }, + ) + + +@pytest.mark.parametrize("layer", range(23)) +@tvm.testing.requires_corstone300 +def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer): + """Checks microTVM output against TFLite for one MobileNetV1 layer. + + Loads the input, kernel, bias, expected output, and quantization parameters from the specified + layer in a TFLite Interpreter. That information is used to construct a Relay Function with the + same structure. The Function is run using microTVM and AOTTestModel, and we verify microTVM's + output is the same as the TFLite ground truth. + + This function only cross-checks the first 23 layers in MobileNetV1, which are regular and + depthwise 2D convolutions (this function only works for 2D convolutions). We do not test the + average pool, dense, or softmax layers at the end of the model. + + Note that we disable the QNN Legalization pass. This allows TVM to use its QNN compute + definitions, fuse the three operations together, and perform other optimizations. 
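+
+    The compiled module runs on the Corstone-300 FVP through AOT_CORSTONE300_RUNNER, so this test
+    is skipped when that environment is unavailable (see the requires_corstone300 marker).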
+ + Parameters + ---------- + interpreter: tensorflow.lite.python.interpreter.Interpreter + A TensorFlow Lite interpreter for a MobileNetV1 model, where invoke() has already been + called and experimental_preserve_all_tensors=True. Should be passed as a Pytest fixture. + + layer: int + The index of the layer to check against TensorFlow's ground truth values. + """ + dtype = "int16" + + tensor, kernel, bias, output = _load_tflite_layer(interpreter, layer) + + padding, strides, is_depthwise = _get_mobilenet_v1_layer_attributes(layer) + if is_depthwise: + data_layout, kernel_layout, output_layout = "NCHW", "OIHW", "NHWC" + else: + data_layout, kernel_layout, output_layout = "NHWC", "OHWI", "NHWC" + + test_model = _make_aot_model( + (tensor, kernel, bias, output), + (dtype, padding, strides), + (data_layout, kernel_layout, output_layout), + is_depthwise=is_depthwise, + ) + + def schedule_fn(_sch): + return True + + with tvm.transform.PassContext( + opt_level=3, + config={ + "tir.disable_vectorize": True, + "relay.backend.use_meta_schedule": True, + "relay.backend.tir_converter": "allow_extern", + }, + disabled_pass=["qnn.Legalize"], + ), meta_schedule.database.ScheduleFnDatabase(schedule_fn): + executor_factory = tvm.relay.build( + test_model.module, + _make_target(), + executor=_make_executor(), + runtime=Runtime("crt"), + params=test_model.params, + mod_name=test_model.name, + ) + compiled = AOTCompiledTestModel(model=test_model, executor_factory=executor_factory) + + run_and_check( + models=[compiled], + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + workspace_byte_alignment=8, + constant_byte_alignment=8, + ) diff --git a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py new file mode 100644 index 000000000000..46d2797ba394 --- /dev/null +++ b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py @@ -0,0 +1,415 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tests for functions in tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot. + +Contains a few unit tests, followed by integration tests for common use cases. Note that we do not +run the generated code - we just make sure the strings match exactly. + +Note that a *lot* of instruction reordering happens during compilation from C to assembly (by GCC or +Clang). I've verified that this instruction reordering happens correctly for all the functions here. +For more details on why the generated code is the way it is, see `tensordot_int16_impl`.""" + +import textwrap + +from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import ( + _get_tensor_halfwords, + _get_kernel_halfwords, + tensordot_int16_impl, +) + + +def test_get_tensor_halfwords(): + """Tests the _get_tensor_halfwords helper function in tensordot.py. 
+ + This function loads the logical indices of the data that will be stored in memory at the tensor + pointer. See the function docstring for more details. + """ + + # fmt: off + # A simple 3x3 depthwise convolution computing one output and with in_stride = 1. Note that each + # row is padded with None at the end to make the rows word-aligned. + assert _get_tensor_halfwords((48, 3, 3), 0, 1, 1) == [ + (0, 0), (0, 1), (0, 2), None, + (1, 0), (1, 1), (1, 2), None, + (2, 0), (2, 1), (2, 2), None + ] + + # If the tensor width is odd, padding alternates before/after every row. + assert _get_tensor_halfwords((49, 3, 3), 0, 1, 1) == [ + (0, 0), (0, 1), (0, 2), None, + None, (1, 0), (1, 1), (1, 2), + (2, 0), (2, 1), (2, 2), None + ] + + # If we are computing multiple outputs, more tensor data becomes relevant. + assert _get_tensor_halfwords((48, 3, 3), 0, 2, 1) == [ + (0, 0), (0, 1), (0, 2), (0, 3), + (1, 0), (1, 1), (1, 2), (1, 3), + (2, 0), (2, 1), (2, 2), (2, 3) + ] + + # If offset=1, relevant data starts one halfword after the kernel pointer. + assert _get_tensor_halfwords((48, 3, 3), 1, 1, 1) == [ + None, (0, 0), (0, 1), (0, 2), + None, (1, 0), (1, 1), (1, 2), + None, (2, 0), (2, 1), (2, 2) + ] + + # These adjustments can be (and often are) used together. + assert _get_tensor_halfwords((49, 3, 3), 1, 2, 2) == [ + None, (0, 0), (0, 1), (0, 2), (0, 3), (0, 4), + (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), None, + None, (2, 0), (2, 1), (2, 2), (2, 3), (2, 4) + ] + # fmt: on + + +def test_get_kernel_halfwords(): + """Tests the _get_kernel_halfwords helper function in tensordot.py. + + This function loads the logical indices of the data that will be stored in memory at the kernel + pointer. See the function docstring for more details. + """ + + # fmt: off + # Example of a kernel for a 3x3 depthwise convolution channel + assert _get_kernel_halfwords((96, 3, 3), 0) == [ + (0, 0), (0, 1), (0, 2), + (1, 0), (1, 1), (1, 2), + (2, 0), (2, 1), (2, 2), + None, + ] + + # Example of a kernel for a 1x1 regular convolution with 4 channels + assert _get_kernel_halfwords((48, 1, 4), 1) == [ + None, (0, 0), (0, 1), (0, 2), (0, 3), None, + ] + # fmt: on + + +def test_write_3x3_depthwise_code(): + """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME" + padding. We are only computing one sum at once, so we don't need stride or output. Note that + this is pretty inefficient - it would be much better to compute a few sums concurrently. 
+ + When inlined, this code compiles (with armv7-a clang 11) into: + + tensordot_opt_x1_int16_w48_3x3_000(int*, int*, int*, int*, int*): + ldr.w lr, [r3] + ldrd r11, r4, [r1] + ldrd r5, r9, [r1, #96] + ldrd r10, r8, [r1, #192] + ldm.w r2, {r1, r6, r7} + ldr.w r12, [sp, #36] + smlad r1, r11, r1, lr + smlabb r1, r4, r6, r1 + smlatb r1, r6, r5, r1 + ldrd r3, r2, [r2, #12] + smlatb r1, r5, r7, r1 + smlatb r1, r7, r9, r1 + smlad r1, r10, r3, r1 + ldr.w r3, [r12] + smlabb r1, r8, r2, r1 + smmul r1, r3, r1 + ssat r1, #8, r1, asr #8 + strh r1, [r0] + """ + _, code = tensordot_int16_impl(1, (48, 3, 3), (0, 0, 0), (1, 1)) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS + #define TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w48_3x3_000( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias; + + int32_t tensor__y00_x00__y00_x01 = tensor[0]; + int32_t tensor__y00_x02__unknown = tensor[1]; + int32_t tensor__y01_x00__y01_x01 = tensor[24]; + int32_t tensor__y01_x02__unknown = tensor[25]; + int32_t tensor__y02_x00__y02_x01 = tensor[48]; + int32_t tensor__y02_x02__unknown = tensor[49]; + + int32_t kernel__y00_x00__y00_x01 = kernel[0]; + int32_t kernel__y00_x02__y01_x00 = kernel[1]; + int32_t kernel__y01_x01__y01_x02 = kernel[2]; + int32_t kernel__y02_x00__y02_x01 = kernel[3]; + int32_t kernel__y02_x02__unknown = kernel[4]; + + sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0); + sum_0 = __smlabb(tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlabt(tensor__y01_x00__y01_x01, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlatb(tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, sum_0); + sum_0 = __smlabt(tensor__y01_x02__unknown, kernel__y01_x01__y01_x02, sum_0); + sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0); + sum_0 = __smlabb(tensor__y02_x02__unknown, kernel__y02_x02__unknown, sum_0); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + -128, 8); + + ((int16_t*) output)[0] = (int16_t) requant_0; + return 0; + } + #endif + """ + ) + + +def test_odd_width_3x3_depthwise_strides_code(): + """This is the function that would be generated for a 1x4x48x48 NCHW input tensor with "SAME" + padding and (2, 2) strides, being written into NHWC layout. The layout change is encoded by + out_stride = 4. This is a common use case seen in MobileNetV1, among others. 
+ + Note that despite the rows not being word-aligned, the *tensor pointer will always be word + aligned (satisfying this requirement) since y_stride = 2.""" + + _, code = tensordot_int16_impl(2, (49, 3, 3), (0, 0, 0), (2, 4)) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS + #define TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x2_int16_w49_3x3_000_2_4( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias, sum_1 = *bias; + + int32_t tensor__y00_x00__y00_x01 = tensor[0]; + int32_t tensor__y00_x02__y00_x03 = tensor[1]; + int32_t tensor__y00_x04__unknown = tensor[2]; + int32_t tensor__unknown__y01_x00 = tensor[24]; + int32_t tensor__y01_x01__y01_x02 = tensor[25]; + int32_t tensor__y01_x03__y01_x04 = tensor[26]; + int32_t tensor__y02_x00__y02_x01 = tensor[49]; + int32_t tensor__y02_x02__y02_x03 = tensor[50]; + int32_t tensor__y02_x04__unknown = tensor[51]; + + int32_t kernel__y00_x00__y00_x01 = kernel[0]; + int32_t kernel__y00_x02__y01_x00 = kernel[1]; + int32_t kernel__y01_x01__y01_x02 = kernel[2]; + int32_t kernel__y02_x00__y02_x01 = kernel[3]; + int32_t kernel__y02_x02__unknown = kernel[4]; + + sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0); + sum_0 = __smlabb(tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlatt(tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, sum_0); + sum_0 = __smlad(tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, sum_0); + sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, sum_0); + sum_0 = __smlabb(tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, sum_0); + sum_1 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, sum_1); + sum_1 = __smlabb(tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, sum_1); + sum_1 = __smlatt(tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, sum_1); + sum_1 = __smlad(tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, sum_1); + sum_1 = __smlad(tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, sum_1); + sum_1 = __smlabb(tensor__y02_x04__unknown, kernel__y02_x02__unknown, sum_1); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + -128, 8); + int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32; + requant_1 = (requant_1 + 1) >> 1; + requant_1 = __ssat(requant_1 + -128, 8); + + ((int16_t*) output)[0] = (int16_t) requant_0; + ((int16_t*) output)[4] = (int16_t) requant_1; + return 0; + } + #endif + """ + ) + + +def test_1x1x8_convolution_code(): + """This is the function that would be generated for a 1x48x48x8 NHWC input tensor under + standard convolution with a 1x1 kernel. This is a common use case seen in MobileNetV1, + among others. 
In this scenario, a very high amount of memory re-use means that summing + four channels at once makes us faster.""" + + _, code = tensordot_int16_impl(4, (48 * 8, 1, 8), (0, 0, 0), (8, 1)) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS + #define TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x4_int16_w384_1x8_000_8_1( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias, sum_1 = *bias, sum_2 = *bias, sum_3 = *bias; + + int32_t tensor__y00_x00__y00_x01 = tensor[0]; + int32_t tensor__y00_x02__y00_x03 = tensor[1]; + int32_t tensor__y00_x04__y00_x05 = tensor[2]; + int32_t tensor__y00_x06__y00_x07 = tensor[3]; + int32_t tensor__y00_x08__y00_x09 = tensor[4]; + int32_t tensor__y00_x0a__y00_x0b = tensor[5]; + int32_t tensor__y00_x0c__y00_x0d = tensor[6]; + int32_t tensor__y00_x0e__y00_x0f = tensor[7]; + int32_t tensor__y00_x10__y00_x11 = tensor[8]; + int32_t tensor__y00_x12__y00_x13 = tensor[9]; + int32_t tensor__y00_x14__y00_x15 = tensor[10]; + int32_t tensor__y00_x16__y00_x17 = tensor[11]; + int32_t tensor__y00_x18__y00_x19 = tensor[12]; + int32_t tensor__y00_x1a__y00_x1b = tensor[13]; + int32_t tensor__y00_x1c__y00_x1d = tensor[14]; + int32_t tensor__y00_x1e__y00_x1f = tensor[15]; + + int32_t kernel__y00_x00__y00_x01 = kernel[0]; + int32_t kernel__y00_x02__y00_x03 = kernel[1]; + int32_t kernel__y00_x04__y00_x05 = kernel[2]; + int32_t kernel__y00_x06__y00_x07 = kernel[3]; + + sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, sum_0); + sum_0 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, sum_0); + sum_0 = __smlad(tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, sum_0); + sum_0 = __smlad(tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, sum_0); + sum_1 = __smlad(tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, sum_1); + sum_1 = __smlad(tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, sum_1); + sum_1 = __smlad(tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, sum_1); + sum_1 = __smlad(tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, sum_1); + sum_2 = __smlad(tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, sum_2); + sum_2 = __smlad(tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, sum_2); + sum_2 = __smlad(tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, sum_2); + sum_2 = __smlad(tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, sum_2); + sum_3 = __smlad(tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, sum_3); + sum_3 = __smlad(tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, sum_3); + sum_3 = __smlad(tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, sum_3); + sum_3 = __smlad(tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, sum_3); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + -128, 8); + int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32; + requant_1 = (requant_1 + 1) >> 1; + requant_1 = __ssat(requant_1 + -128, 8); + int32_t requant_2 = (sum_2 * (int64_t) scale_val) >> 32; + requant_2 = (requant_2 + 1) >> 1; + requant_2 = __ssat(requant_2 + -128, 8); + int32_t requant_3 = (sum_3 * (int64_t) scale_val) >> 32; + requant_3 = (requant_3 + 1) >> 1; + requant_3 = __ssat(requant_3 + -128, 8); + + int32_t packed_res_0 = requant_0 + (requant_1 << 16); + int32_t packed_res_1 = requant_2 + (requant_3 << 16); + output[0] = 
packed_res_0; + output[1] = packed_res_1; + return 0; + } + #endif + """ + ) + + +def test_3x3x3_offset_convolution_code(): + """This is the function that would be generated for a 1x96x96x3 NHWC input tensor under + standard convolution with a 3x3x3 kernel - the first layer of MobileNetV1. This is special, as + it means that every other kernel channel will not start on an even numbered halfword. We won't + have this issue for the input tensor, as we will always compute two positions at a time. + + To solve this 'every other' issue, we will need two different version of this function to + alternate between. This alternation will be handled in TIR scheduling. Here, we just test the + version where the kernel is not word aligned. + + Also tests the requantize_shift and output_zero_point keyword args. These might be needed for + some ResNet models (like image classification from MLPerf Tiny). + """ + + _, code = tensordot_int16_impl( + 1, + (96 * 3, 3, 9), + (1, 1, 1), + (3, 1), + requantize_shift=40, + output_zero_point=4, + ) + assert code == textwrap.dedent( + """ + #ifndef TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS + #define TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS + #include + __attribute__((always_inline)) static inline int32_t tensordot_opt_x1_int16_w288_3x9_111( + int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale + ) { + int32_t sum_0 = *bias; + + int32_t tensor__unknown__y00_x00 = tensor[0]; + int32_t tensor__y00_x01__y00_x02 = tensor[1]; + int32_t tensor__y00_x03__y00_x04 = tensor[2]; + int32_t tensor__y00_x05__y00_x06 = tensor[3]; + int32_t tensor__y00_x07__y00_x08 = tensor[4]; + int32_t tensor__unknown__y01_x00 = tensor[144]; + int32_t tensor__y01_x01__y01_x02 = tensor[145]; + int32_t tensor__y01_x03__y01_x04 = tensor[146]; + int32_t tensor__y01_x05__y01_x06 = tensor[147]; + int32_t tensor__y01_x07__y01_x08 = tensor[148]; + int32_t tensor__unknown__y02_x00 = tensor[288]; + int32_t tensor__y02_x01__y02_x02 = tensor[289]; + int32_t tensor__y02_x03__y02_x04 = tensor[290]; + int32_t tensor__y02_x05__y02_x06 = tensor[291]; + int32_t tensor__y02_x07__y02_x08 = tensor[292]; + + int32_t kernel__unknown__y00_x00 = kernel[0]; + int32_t kernel__y00_x01__y00_x02 = kernel[1]; + int32_t kernel__y00_x03__y00_x04 = kernel[2]; + int32_t kernel__y00_x05__y00_x06 = kernel[3]; + int32_t kernel__y00_x07__y00_x08 = kernel[4]; + int32_t kernel__y01_x00__y01_x01 = kernel[5]; + int32_t kernel__y01_x02__y01_x03 = kernel[6]; + int32_t kernel__y01_x04__y01_x05 = kernel[7]; + int32_t kernel__y01_x06__y01_x07 = kernel[8]; + int32_t kernel__y01_x08__y02_x00 = kernel[9]; + int32_t kernel__y02_x01__y02_x02 = kernel[10]; + int32_t kernel__y02_x03__y02_x04 = kernel[11]; + int32_t kernel__y02_x05__y02_x06 = kernel[12]; + int32_t kernel__y02_x07__y02_x08 = kernel[13]; + + sum_0 = __smlatt(tensor__unknown__y00_x00, kernel__unknown__y00_x00, sum_0); + sum_0 = __smlad(tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, sum_0); + sum_0 = __smlad(tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, sum_0); + sum_0 = __smlad(tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, sum_0); + sum_0 = __smlad(tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, sum_0); + sum_0 = __smlatb(tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, sum_0); + sum_0 = __smlabt(tensor__y01_x01__y01_x02, kernel__y01_x00__y01_x01, sum_0); + sum_0 = __smlatb(tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, sum_0); + sum_0 = __smlabt(tensor__y01_x03__y01_x04, kernel__y01_x02__y01_x03, sum_0); + sum_0 = 
__smlatb(tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, sum_0); + sum_0 = __smlabt(tensor__y01_x05__y01_x06, kernel__y01_x04__y01_x05, sum_0); + sum_0 = __smlatb(tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, sum_0); + sum_0 = __smlabt(tensor__y01_x07__y01_x08, kernel__y01_x06__y01_x07, sum_0); + sum_0 = __smlatb(tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, sum_0); + sum_0 = __smlatt(tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, sum_0); + sum_0 = __smlad(tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, sum_0); + sum_0 = __smlad(tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, sum_0); + sum_0 = __smlad(tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, sum_0); + sum_0 = __smlad(tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, sum_0); + + int32_t scale_val = *scale; + int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 39; + requant_0 = (requant_0 + 1) >> 1; + requant_0 = __ssat(requant_0 + 4, 8); + + ((int16_t*) output)[1] = (int16_t) requant_0; + return 0; + } + #endif + """ + ) diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py index ce379b6b2cb3..cb24353539a4 100644 --- a/tests/scripts/request_hook/request_hook.py +++ b/tests/scripts/request_hook/request_hook.py @@ -145,6 +145,7 @@ "https://github.com/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy", "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy", "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite": f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite", + "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite": f"{BASE}/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite", "https://github.com/uwsampl/web-data/raw/main/vta/models/synset.txt": f"{BASE}/2022-10-05/synset.txt", "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": f"{BASE}/gcn_cora.torch", "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg",