diff --git a/example.py b/example.py index 9c707111..5050b25e 100644 --- a/example.py +++ b/example.py @@ -601,6 +601,28 @@ def core(self, slothy): slothy.optimize_loop("layer123_start") slothy.optimize_loop("layer4567_start") +class intt_kyber_123_4567(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None): + name = "intt_kyber_123_4567" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout) + + def core(self, slothy): + slothy.config.sw_pipelining.enabled = True + slothy.config.inputs_are_outputs = True + slothy.config.sw_pipelining.minimize_overlapping = False + slothy.config.variable_size = True + slothy.config.reserved_regs = [f"x{i}" for i in range(0, 7)] + ["x30", "sp"] + slothy.config.constraints.stalls_first_attempt = 64 + slothy.optimize_loop("layer4567_start") + slothy.optimize_loop("layer123_start") + class ntt_kyber_123(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): @@ -1030,6 +1052,39 @@ def core(self, slothy): slothy.optimize_loop("layer45678_start") +class intt_dilithium_123_45678(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55, timeout=None): + name = f"intt_dilithium_123_45678" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout) + + def core(self, slothy): + slothy.config.sw_pipelining.enabled = True + slothy.config.sw_pipelining.minimize_overlapping = False + slothy.config.inputs_are_outputs = True + + slothy.config.reserved_regs = [ + f"x{i}" for i in range(0, 7)] + ["v8", "x30", "sp"] + slothy.config.reserved_regs += self.target_reserved + slothy.config.constraints.stalls_first_attempt = 40 + 
slothy.optimize_loop("layer45678_start") + + slothy.config.reserved_regs = [ + f"x{i}" for i in range(0, 7)] + ["v8", "x30", "sp"] + slothy.config.reserved_regs += self.target_reserved + slothy.config.inputs_are_outputs = True + slothy.config.constraints.stalls_first_attempt = 110 + slothy.optimize_loop("layer123_start") + + + + class ntt_dilithium_123(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): name = "ntt_dilithium_123" @@ -1124,6 +1179,51 @@ def core(self, slothy): slothy.optimize_loop("layer5678_start") +class intt_dilithium_1234_5678(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72, timeout=None): + name = f"intt_dilithium_1234_5678" + infile = name + + if var != "": + name += f"_{var}" + infile += f"_{var}" + name += f"_{target_label_dict[target]}" + + super().__init__(infile, name, rename=True, arch=arch, target=target, timeout=timeout) + + def core(self, slothy): + conf = slothy.config.copy() + + slothy.config.reserved_regs = [ + f"x{i}" for i in range(0, 6)] + ["x30", "sp"] + slothy.config.inputs_are_outputs = True + slothy.config.reserved_regs += self.target_reserved + slothy.config.sw_pipelining.enabled = True + slothy.config.sw_pipelining.minimize_overlapping = False + slothy.config.sw_pipelining.halving_heuristic = False + slothy.config.split_heuristic = False + slothy.optimize_loop("layer5678_start") + + slothy.config = conf.copy() + + if self.timeout is not None: + slothy.config.timeout = self.timeout // 12 + + slothy.config.sw_pipelining.enabled = True + slothy.config.sw_pipelining.minimize_overlapping = False + slothy.config.reserved_regs = [ + f"x{i}" for i in range(0, 6)] + ["x30", "sp"] + slothy.config.reserved_regs += self.target_reserved + slothy.config.inputs_are_outputs = True + slothy.config.sw_pipelining.halving_heuristic = True + slothy.config.split_heuristic = True + slothy.config.split_heuristic_factor = 2 + slothy.config.split_heuristic_repeat = 4 + 
slothy.config.split_heuristic_stepsize = 0.1 + slothy.config.constraints.stalls_first_attempt = 14 + slothy.optimize_loop("layer1234_start") + + class ntt_dilithium_1234(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA72): name = "ntt_dilithium_1234" @@ -1296,6 +1396,8 @@ def main(): ntt_kyber_123_4567(var="scalar_load_store"), ntt_kyber_123_4567(var="manual_st4"), ntt_kyber_1234_567(), + intt_kyber_123_4567(), + intt_kyber_123_4567(var="manual_ld4"), # Cortex-A72 ntt_kyber_123_4567(target=Target_CortexA72), ntt_kyber_123_4567(var="scalar_load", target=Target_CortexA72), @@ -1303,6 +1405,8 @@ def main(): ntt_kyber_123_4567(var="scalar_load_store", target=Target_CortexA72), ntt_kyber_123_4567(var="manual_st4", target=Target_CortexA72), ntt_kyber_1234_567(target=Target_CortexA72), + intt_kyber_123_4567(target=Target_CortexA72), + intt_kyber_123_4567(var="manual_ld4", target=Target_CortexA72), # # Apple M1 Firestorm ntt_kyber_123_4567(target=Target_AppleM1_firestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_load", target=Target_AppleM1_firestorm, timeout=3600), @@ -1311,6 +1415,8 @@ def main(): ntt_kyber_123_4567(var="manual_st4", target=Target_AppleM1_firestorm, timeout=3600), ntt_kyber_1234_567(target=Target_AppleM1_firestorm, timeout=300), ntt_kyber_1234_567(var="manual_st4", target=Target_AppleM1_firestorm, timeout=300), + intt_kyber_123_4567(target=Target_AppleM1_firestorm, timeout=3600), + intt_kyber_123_4567(var="manual_ld4", target=Target_AppleM1_firestorm, timeout=3600), # Apple M1 Icestorm ntt_kyber_123_4567(target=Target_AppleM1_icestorm, timeout=3600), ntt_kyber_123_4567(var="scalar_load", target=Target_AppleM1_icestorm, timeout=3600), @@ -1319,6 +1425,8 @@ def main(): ntt_kyber_123_4567(var="manual_st4", target=Target_AppleM1_icestorm, timeout=3600), ntt_kyber_1234_567(target=Target_AppleM1_icestorm, timeout=300), ntt_kyber_1234_567(var="manual_st4", target=Target_AppleM1_icestorm, timeout=300), + 
intt_kyber_123_4567(target=Target_AppleM1_icestorm, timeout=3600), + intt_kyber_123_4567(var="manual_ld4", target=Target_AppleM1_icestorm, timeout=3600), # Kyber InvNTT # Cortex-M55 intt_kyber_1_23_45_67(), @@ -1340,24 +1448,40 @@ def main(): ntt_dilithium_123_45678(var="manual_st4"), ntt_dilithium_1234_5678(), ntt_dilithium_1234_5678(var="manual_st4"), + intt_dilithium_123_45678(), + intt_dilithium_123_45678(var="manual_ld4"), + intt_dilithium_1234_5678(), + intt_dilithium_1234_5678(var="manual_ld4"), # Cortex-A72 ntt_dilithium_123_45678(target=Target_CortexA72), ntt_dilithium_123_45678(var="w_scalar", target=Target_CortexA72), ntt_dilithium_123_45678(var="manual_st4", target=Target_CortexA72), ntt_dilithium_1234_5678(target=Target_CortexA72), ntt_dilithium_1234_5678(var="manual_st4", target=Target_CortexA72), + intt_dilithium_123_45678(target=Target_CortexA72), + intt_dilithium_123_45678(var="manual_ld4", target=Target_CortexA72), + intt_dilithium_1234_5678(target=Target_CortexA72), + intt_dilithium_1234_5678(var="manual_ld4", target=Target_CortexA72), # Apple M1 Firestorm - ntt_dilithium_123_45678(target=Target_AppleM1_firestorm, timeout=3600), + ntt_dilithium_123_45678(target=Target_AppleM1_firestorm, timeout=3600), ntt_dilithium_123_45678(var="w_scalar", target=Target_AppleM1_firestorm, timeout=3600), ntt_dilithium_123_45678(var="manual_st4", target=Target_AppleM1_firestorm, timeout=3600), ntt_dilithium_1234_5678(target=Target_AppleM1_firestorm, timeout=300), ntt_dilithium_1234_5678(var="manual_st4", target=Target_AppleM1_firestorm, timeout=300), + intt_dilithium_123_45678(target=Target_AppleM1_firestorm, timeout=3600), + intt_dilithium_123_45678(var="manual_ld4", target=Target_AppleM1_firestorm, timeout=3600), + intt_dilithium_1234_5678(target=Target_AppleM1_firestorm, timeout=3600), + intt_dilithium_1234_5678(var="manual_ld4", target=Target_AppleM1_firestorm, timeout=3600), # Apple M1 Icestorm ntt_dilithium_123_45678(target=Target_AppleM1_icestorm, 
timeout=3600), ntt_dilithium_123_45678(var="w_scalar", target=Target_AppleM1_icestorm, timeout=3600), ntt_dilithium_123_45678(var="manual_st4", target=Target_AppleM1_icestorm, timeout=3600), ntt_dilithium_1234_5678(target=Target_AppleM1_icestorm, timeout=300), ntt_dilithium_1234_5678(var="manual_st4", target=Target_AppleM1_icestorm, timeout=300), + intt_dilithium_123_45678(target=Target_AppleM1_icestorm, timeout=3600), + intt_dilithium_123_45678(var="manual_ld4", target=Target_AppleM1_icestorm, timeout=3600), + intt_dilithium_1234_5678(target=Target_AppleM1_icestorm, timeout=3600), + intt_dilithium_1234_5678(var="manual_ld4", target=Target_AppleM1_icestorm, timeout=3600), # Dilithium invNTT # Cortex-M55 intt_dilithium_12_34_56_78(), diff --git a/examples/misc/gen_roots.py b/examples/misc/gen_roots.py index ebf63201..47148a32 100644 --- a/examples/misc/gen_roots.py +++ b/examples/misc/gen_roots.py @@ -405,6 +405,13 @@ def _main(): ntt_kyber_l123.export("../naive/ntt_kyber_123_45_67_twiddles.s") ntt_kyber_l123.export("../opt/ntt_kyber_123_45_67_twiddles.s") + # For intt_kyber_123_4567.s + intt_kyber_l123 = NttRootGen(size=256,modulus=3329,root=17,layers=7,iters=[(0,3),(3,2),(5,2)], + pad=[0,3], print_label=True, widen_single_twiddles_to_words=False, + inverse=True) + intt_kyber_l123.export("../naive/aarch64/intt_kyber_123_45_67_twiddles.s") + intt_kyber_l123.export("../opt/aarch64/intt_kyber_123_45_67_twiddles.s") + ntt_kyber = NttRootGen(size=256,modulus=3329,root=17,layers=7) ntt_kyber.export("../naive/ntt_kyber_1_23_45_67_twiddles.s") ntt_kyber.export("../opt/ntt_kyber_1_23_45_67_twiddles.s") @@ -428,6 +435,11 @@ def _main(): ntt_dilithium_l123.export("../naive/ntt_dilithium_123_456_78_twiddles.s") ntt_dilithium_l123.export("../opt/ntt_dilithium_123_456_78_twiddles.s") + intt_dilithium_l123 = NttRootGen(size=256,inverse=True,bitsize=32,modulus=8380417,root=1753,layers=8, + print_label=True, pad=[0,3], iters=[(0,3),(3,3),(6,2)]) + 
intt_dilithium_l123.export("../naive/aarch64/intt_dilithium_123_456_78_twiddles.s") + intt_dilithium_l123.export("../opt/aarch64/intt_dilithium_123_456_78_twiddles.s") + ntt_dilithium_l123 = NttRootGen(size=256,bitsize=32,modulus=8380417,root=1753,layers=8, print_label=True, pad=[0,3], iters=[(0,3),(3,3),(6,2)]) ntt_dilithium_l123.export("../naive/aarch64/ntt_dilithium_123_456_78_twiddles.s") diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678.s b/examples/naive/aarch64/intt_dilithium_1234_5678.s index b3743103..1e8008fe 100644 --- a/examples/naive/aarch64/intt_dilithium_1234_5678.s +++ b/examples/naive/aarch64/intt_dilithium_1234_5678.s @@ -85,18 +85,18 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmls \dst, \src, modulus + vmls \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus .endm -.macro montg_reduce a +.macro barrett_reduce_single a srshr tmp.4S, \a\().4S, #23 vmls \a, tmp, modulus .endm @@ -114,12 +114,6 @@ mulmodq \b, tmp, \root, \idx0, \idx1 .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmls \dst, \src, modulus -.endm - .macro gs_butterfly_v a, b, root, root_twisted vsub tmp, \a, \b vadd \a, \a, \b @@ -235,6 +229,12 @@ restore_gprs .endm +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. 
+ .data .p2align 4 roots: @@ -334,10 +334,14 @@ _intt_dilithium_1234_5678: .p2align 2 layer5678_start: - ldr_vo data0, inp, (16*0) - ldr_vo data1, inp, (16*1) - ldr_vo data2, inp, (16*2) - ldr_vo data3, inp, (16*3) + // manual_ld4 + // ldr_vo data0, inp, (16*0) + // ldr_vo data1, inp, (16*1) + // ldr_vo data2, inp, (16*2) + // ldr_vo data3, inp, (16*3) + // transpose4 data + + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr0 @@ -356,8 +360,8 @@ layer5678_start: gs_butterfly data0, data2, root1, 0, 1 gs_butterfly data1, data3, root1, 0, 1 - montg_reduce data0 - montg_reduce data1 + barrett_reduce_single data0 + barrett_reduce_single data1 str_vi data0, inp, (16*4) str_vo data1, inp, (-16*4 + 1*16) @@ -482,25 +486,28 @@ layer1234_start: str_vo data14, in, (14*(512/8)) str_vo data15, in, (15*(512/8)) - mul_ninv data8, data9, data10, data11, data12, data13, data14, data15, data0, data1, data2, data3, data4, data5, data6, data7 - - canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 - canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 - - str_vi data8, in, (16) - str_vo data9, in, (-16 + 1*(512/8)) - str_vo data10, in, (-16 + 2*(512/8)) - str_vo data11, in, (-16 + 3*(512/8)) - str_vo data12, in, (-16 + 4*(512/8)) - str_vo data13, in, (-16 + 5*(512/8)) - str_vo data14, in, (-16 + 6*(512/8)) - str_vo data15, in, (-16 + 7*(512/8)) + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last 
layer. + mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + str_vo data4, in, (-16 + 4*(512/8)) + str_vo data5, in, (-16 + 5*(512/8)) + str_vo data6, in, (-16 + 6*(512/8)) + str_vo data7, in, (-16 + 7*(512/8)) // layer1234_end: subs count, count, #1 diff --git a/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s new file mode 100644 index 00000000..22d24757 --- /dev/null +++ b/examples/naive/aarch64/intt_dilithium_1234_5678_manual_ld4.s @@ -0,0 +1,515 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all 
+/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+ +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 + vmulq \dst, \src, \const, \idx0 + vmls \dst, t2, modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmls \dst, t2, modulus +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + 
+.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, 
#16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. 
+ +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4 + .global _intt_dilithium_1234_5678_manual_ld4 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4: +_intt_dilithium_1234_5678_manual_ld4: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 +layer5678_start: + // manual_ld4 + ldr_vo data0, inp, (16*0) + ldr_vo data1, inp, (16*1) + ldr_vo data2, inp, (16*2) + ldr_vo data3, inp, (16*3) + transpose4 data + + load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr0 + + gs_butterfly_v 
data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data + + load_next_roots_6 root1, r_ptr1 + load_next_roots_56 root0, r_ptr1 + + gs_butterfly data0, data1, root0, 0, 1 + gs_butterfly data2, data3, root0, 2, 3 + gs_butterfly data0, data2, root1, 0, 1 + gs_butterfly data1, data3, root1, 0, 1 + + barrett_reduce_single data0 + barrett_reduce_single data1 + + str_vi data0, inp, (16*4) + str_vo data1, inp, (-16*4 + 1*16) + str_vo data2, inp, (-16*4 + 2*16) + str_vo data3, inp, (-16*4 + 3*16) +// layer5678_end: + subs count, count, #1 + cbnz count, layer5678_start + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 +layer1234_start: + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + ldr_vo data8, in, (8*(512/8)) + ldr_vo data9, in, (9*(512/8)) + ldr_vo data10, in, (10*(512/8)) + ldr_vo data11, in, (11*(512/8)) + ldr_vo data12, in, (12*(512/8)) + ldr_vo data13, in, (13*(512/8)) + ldr_vo data14, in, (14*(512/8)) + ldr_vo data15, in, (15*(512/8)) + + // layer4 + gs_butterfly data0, data1, root3, 
2, 3 + gs_butterfly data2, data3, root4, 0, 1 + gs_butterfly data4, data5, root4, 2, 3 + gs_butterfly data6, data7, root5, 0, 1 + gs_butterfly data8, data9, root5, 2, 3 + gs_butterfly data10, data11, root6, 0, 1 + gs_butterfly data12, data13, root6, 2, 3 + gs_butterfly data14, data15, root7, 0, 1 + + // layer3 + gs_butterfly data0, data2, root1, 2, 3 + gs_butterfly data1, data3, root1, 2, 3 + gs_butterfly data4, data6, root2, 0, 1 + gs_butterfly data5, data7, root2, 0, 1 + gs_butterfly data8, data10, root2, 2, 3 + gs_butterfly data9, data11, root2, 2, 3 + gs_butterfly data12, data14, root3, 0, 1 + gs_butterfly data13, data15, root3, 0, 1 + + // layer2 + gs_butterfly data0, data4, root0, 2, 3 + gs_butterfly data1, data5, root0, 2, 3 + gs_butterfly data2, data6, root0, 2, 3 + gs_butterfly data3, data7, root0, 2, 3 + gs_butterfly data8, data12, root1, 0, 1 + gs_butterfly data9, data13, root1, 0, 1 + gs_butterfly data10, data14, root1, 0, 1 + gs_butterfly data11, data15, root1, 0, 1 + + // layer 1 + gs_butterfly data0, data8, root0, 0, 1 + gs_butterfly data1, data9, root0, 0, 1 + gs_butterfly data2, data10, root0, 0, 1 + gs_butterfly data3, data11, root0, 0, 1 + gs_butterfly data4, data12, root0, 0, 1 + gs_butterfly data5, data13, root0, 0, 1 + gs_butterfly data6, data14, root0, 0, 1 + gs_butterfly data7, data15, root0, 0, 1 + + canonical_reduce data8, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data9, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data10, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data11, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data12, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data13, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data14, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data15, modulus_half, neg_modulus_half, t2, t3 + + str_vo data8, in, (8*(512/8)) + str_vo data9, in, (9*(512/8)) + str_vo data10, in, (10*(512/8)) + str_vo data11, in, 
(11*(512/8)) + str_vo data12, in, (12*(512/8)) + str_vo data13, in, (13*(512/8)) + str_vo data14, in, (14*(512/8)) + str_vo data15, in, (15*(512/8)) + + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data4, data5, data6, data7, data0, data1, data2, data3, data4, data5, data6, data7 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + str_vo data4, in, (-16 + 4*(512/8)) + str_vo data5, in, (-16 + 5*(512/8)) + str_vo data6, in, (-16 + 6*(512/8)) + str_vo data7, in, (-16 + 7*(512/8)) + +// layer1234_end: + subs count, count, #1 + cbnz count, layer1234_start + + pop_stack + ret diff --git a/examples/naive/aarch64/intt_dilithium_123_45678.s b/examples/naive/aarch64/intt_dilithium_123_45678.s new file mode 100644 index 00000000..598a1a9c --- /dev/null +++ b/examples/naive/aarch64/intt_dilithium_123_45678.s @@ -0,0 +1,542 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. 
+// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 + vmulq \dst, \src, \const, \idx0 + vmlsq \dst, t2, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vmlsq \dst, t2, consts, 0 +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, consts +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw 
+ mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo 
root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+// Second half of the 12-vector block addressed relative to r_ptr1 after the
+// post-increment done in load_roots_78_part1 (hence the -12*16 bias).
+.macro load_roots_78_part2
+    ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit lanes across four vectors; clobbers t0-t3.
+.macro transpose4 data0, data1, data2, data3
+    trn1 t0.4s, \data0\().4s, \data1\().4s
+    trn2 t1.4s, \data0\().4s, \data1\().4s
+    trn1 t2.4s, \data2\().4s, \data3\().4s
+    trn2 t3.4s, \data2\().4s, \data3\().4s
+
+    trn2 \data2\().2d, t0.2d, t2.2d
+    trn2 \data3\().2d, t1.2d, t3.2d
+    trn1 \data0\().2d, t0.2d, t2.2d
+    trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// Only the 32-bit interleave step of the transpose, written to separate outputs.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+    trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+    trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+    trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+    trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+// Reserve 6*16 bytes and store x19-x30, one pair per 16-byte slot.
+// The slot layout is mirrored by restore_gprs.
+.macro save_gprs // @slothy:no-unfold
+    sub sp, sp, #(16*6)
+    stp x19, x20, [sp, #16*0]
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    stp x29, x30, [sp, #16*5]
+.endm
+
+// Reload x19-x30 from the slots written by save_gprs and release the frame.
+.macro restore_gprs // @slothy:no-unfold
+    ldp x19, x20, [sp, #16*0]
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldp x29, x30, [sp, #16*5]
+    add sp, sp, #(16*6)
+.endm
+
+// Reserve 4*16 bytes and store d8-d15 (the 64-bit views of v8-v15).
+.macro save_vregs // @slothy:no-unfold
+    sub sp, sp, #(16*4)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+// Reload d8-d15 from the slots written by save_vregs and release the frame.
+.macro restore_vregs // @slothy:no-unfold
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // @slothy:no-unfold
+
ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678 + .global _intt_dilithium_123_45678 + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678: +_intt_dilithium_123_45678: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 
+ qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 +layer45678_start: + // Standard way using vector instructions + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] + ld4 {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp] + + load_roots_78_part1 + + // Layer 8 Part 1 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 7 Part 1 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + load_roots_78_part2 + + // Layer 8 Part 2 + gs_butterfly_v data4, data5, root1, root1_tw + gs_butterfly_v data6, data7, root2, root2_tw + // Layer 7 Part 2 + gs_butterfly_v data4, data6, root0, root0_tw + gs_butterfly_v data5, data7, root0, root0_tw + + transpose4 data0, data1, data2, data3 + transpose4 data4, data5, data6, data7 + + load_roots_456 + + // Layer 6 + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, 
data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + // Layer 5 + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // Interm. Reduction + barrett_reduce_single data0 + barrett_reduce_single data1 + barrett_reduce_single data4 + barrett_reduce_single data5 + + // Layer 4 + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + // Standard way using vector instructions + + str_vi data0, inp, (16*4) + str_vo data1, inp, (-16*4 + 1*16) + str_vo data2, inp, (-16*4 + 2*16) + str_vo data3, inp, (-16*4 + 3*16) + + str_vi data4, inpp, (16*4) + str_vo data5, inpp, (-16*4 + 1*16) + str_vo data6, inpp, (-16*4 + 2*16) + str_vo data7, inpp, (-16*4 + 3*16) + + add inp, inp, #64 + add inpp, inpp, #64 + + subs count, count, #1 + cbnz count, layer45678_start + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(1024/8)) + ldr_vo data2, in, (2*(1024/8)) + ldr_vo data3, in, (3*(1024/8)) + ldr_vo data4, in, (4*(1024/8)) + ldr_vo data5, in, (5*(1024/8)) + ldr_vo data6, in, (6*(1024/8)) + ldr_vo data7, in, (7*(1024/8)) + + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 
0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // root0[0] includes ninv, manually computed. + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vo data4, in, (4*(1024/8)) + str_vo data5, in, (5*(1024/8)) + str_vo data6, in, (6*(1024/8)) + str_vo data7, in, (7*(1024/8)) + + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(1024/8)) + str_vo data2, in, (-16 + 2*(1024/8)) + str_vo data3, in, (-16 + 3*(1024/8)) + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret diff --git a/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s new file mode 100644 index 00000000..69dc2c2a --- /dev/null +++ b/examples/naive/aarch64/intt_dilithium_123_45678_manual_ld4.s @@ -0,0 +1,551 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+//
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+xtmp0 .req x10
+xtmp1 .req x11
+
+// Load/store a full q-register; "_vo" takes an immediate offset, "_vi" post-increments the base.
+.macro ldr_vo vec, base, offset
+    ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc
+    ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset
+    str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc
+    str qform_\vec, [\base], #\inc
+.endm
+.macro vqrdmulh d,a,b
+    sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmls d,a,b
+    mls \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulhq d,a,b,i
+    sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+// By-element form: the lane operand is written without a lane count (.s[i]),
+// matching the other by-element wrappers in this file.
+.macro vqdmulhq d,a,b,i
+    sqdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmulq d,a,b,i
+    mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i
+    mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * consts[0]; clobbers t2.
+.macro mulmodq dst, src, const, idx0, idx1
+    vqrdmulhq t2, \src, \const, \idx1
+    vmulq \dst, \src, \const, \idx0
+    vmlsq \dst, t2, consts, 0
+.endm
+
+// Full-vector variant of mulmodq:
+// dst = src * const - sqrdmulh(src, const_twisted) * consts[0]; clobbers t2.
+.macro mulmod dst, src, const, const_twisted
+    vqrdmulh t2, \src, \const_twisted
+    mul \dst\().4s, \src\().4s, \const\().4s
+    vmlsq \dst, t2, consts, 0
+.endm
+
+// a -= (a rounded-shifted right by 23) * consts (lane-wise); clobbers tmp.
+.macro barrett_reduce_single a
+    srshr tmp.4S, \a\().4S, #23
+    vmls \a, tmp, consts
+.endm
+
+// Builds a per-lane mask difference from the two range compares and uses it
+// to add or subtract consts once; clobbers tmp1 and tmp2.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+    cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+    cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+    sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+    vmls \a, \tmp2, consts
+.endm
+
+// tmp = a - b; a = a + b; b = mulmodq(tmp, root[idx0], root[idx1]); clobbers tmp, t2.
+.macro gs_butterfly a, b, root, idx0, idx1
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// Same as gs_butterfly, but the root and its twisted companion are full vectors.
+.macro gs_butterfly_v a, b, root, root_twisted
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmod \b, tmp, \root, \root_twisted
+.endm
+
+.macro mul_ninv dst0, dst1, dst2,
dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 
+ 2*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+// Second half of the 12-vector block addressed relative to r_ptr1 after the
+// post-increment done in load_roots_78_part1 (hence the -12*16 bias).
+.macro load_roots_78_part2
+    ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit lanes across four vectors; clobbers t0-t3.
+.macro transpose4 data0, data1, data2, data3
+    trn1 t0.4s, \data0\().4s, \data1\().4s
+    trn2 t1.4s, \data0\().4s, \data1\().4s
+    trn1 t2.4s, \data2\().4s, \data3\().4s
+    trn2 t3.4s, \data2\().4s, \data3\().4s
+
+    trn2 \data2\().2d, t0.2d, t2.2d
+    trn2 \data3\().2d, t1.2d, t3.2d
+    trn1 \data0\().2d, t0.2d, t2.2d
+    trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// Only the 32-bit interleave step of the transpose, written to separate outputs.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+    trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+    trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+    trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+    trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+// Reserve 6*16 bytes and store x19-x30, one pair per 16-byte slot.
+// The slot layout is mirrored by restore_gprs.
+.macro save_gprs // @slothy:no-unfold
+    sub sp, sp, #(16*6)
+    stp x19, x20, [sp, #16*0]
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    stp x29, x30, [sp, #16*5]
+.endm
+
+// Reload x19-x30 from the slots written by save_gprs and release the frame.
+.macro restore_gprs // @slothy:no-unfold
+    ldp x19, x20, [sp, #16*0]
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldp x29, x30, [sp, #16*5]
+    add sp, sp, #(16*6)
+.endm
+
+// Reserve 4*16 bytes and store d8-d15 (the 64-bit views of v8-v15).
+.macro save_vregs // @slothy:no-unfold
+    sub sp, sp, #(16*4)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+// Reload d8-d15 from the slots written by save_vregs and release the frame.
+.macro restore_vregs // @slothy:no-unfold
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)
+.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are canonically reduced. The ordering of the coefficients is canonical, also +// matching PQClean. + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4 + .global _intt_dilithium_123_45678_manual_ld4 + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_manual_ld4: +_intt_dilithium_123_45678_manual_ld4: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + 
qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 +layer45678_start: + // Manual ld4 using vector instructions + ldr_vo data0, inp, 0 + ldr_vo data1, inp, 16 + ldr_vo data2, inp, 32 + ldr_vo data3, inp, 48 + transpose4 data0, data1, data2, data3 + + ldr_vo data4, inpp, 0 + ldr_vo data5, inpp, 16 + ldr_vo data6, inpp, 32 + ldr_vo data7, inpp, 48 + transpose4 data4, data5, data6, data7 + + load_roots_78_part1 + + // Layer 8 Part 1 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 7 Part 1 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + load_roots_78_part2 + + // Layer 8 Part 2 + gs_butterfly_v data4, data5, root1, root1_tw + gs_butterfly_v data6, data7, root2, root2_tw + // Layer 7 Part 2 + 
gs_butterfly_v data4, data6, root0, root0_tw + gs_butterfly_v data5, data7, root0, root0_tw + + transpose4 data0, data1, data2, data3 + transpose4 data4, data5, data6, data7 + + load_roots_456 + + // Layer 6 + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + // Layer 5 + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // Interm. Reduction + barrett_reduce_single data0 + barrett_reduce_single data1 + barrett_reduce_single data4 + barrett_reduce_single data5 + + // Layer 4 + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + // Standard way using vector instructions + + str_vi data0, inp, (16*4) + str_vo data1, inp, (-16*4 + 1*16) + str_vo data2, inp, (-16*4 + 2*16) + str_vo data3, inp, (-16*4 + 3*16) + + str_vi data4, inpp, (16*4) + str_vo data5, inpp, (-16*4 + 1*16) + str_vo data6, inpp, (-16*4 + 2*16) + str_vo data7, inpp, (-16*4 + 3*16) + + add inp, inp, #64 + add inpp, inpp, #64 + + subs count, count, #1 + cbnz count, layer45678_start + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(1024/8)) + ldr_vo data2, in, (2*(1024/8)) + ldr_vo data3, in, (3*(1024/8)) + ldr_vo data4, in, (4*(1024/8)) + ldr_vo data5, in, (5*(1024/8)) + ldr_vo data6, in, (6*(1024/8)) + 
ldr_vo data7, in, (7*(1024/8)) + + gs_butterfly data0, data1, root1, 2, 3 + gs_butterfly data2, data3, root2, 0, 1 + gs_butterfly data4, data5, root2, 2, 3 + gs_butterfly data6, data7, root3, 0, 1 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root1, 0, 1 + gs_butterfly data5, data7, root1, 0, 1 + + // root0[0] includes ninv, manually computed. + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + canonical_reduce data4, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data5, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data6, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data7, modulus_half, neg_modulus_half, t2, t3 + + str_vo data4, in, (4*(1024/8)) + str_vo data5, in, (5*(1024/8)) + str_vo data6, in, (6*(1024/8)) + str_vo data7, in, (7*(1024/8)) + + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. 
+ mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 + + canonical_reduce data0, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data1, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data2, modulus_half, neg_modulus_half, t2, t3 + canonical_reduce data3, modulus_half, neg_modulus_half, t2, t3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(1024/8)) + str_vo data2, in, (-16 + 2*(1024/8)) + str_vo data3, in, (-16 + 3*(1024/8)) + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret diff --git a/examples/naive/aarch64/intt_dilithium_123_456_78_twiddles.s b/examples/naive/aarch64/intt_dilithium_123_456_78_twiddles.s new file mode 100644 index 00000000..43e0d175 --- /dev/null +++ b/examples/naive/aarch64/intt_dilithium_123_456_78_twiddles.s @@ -0,0 +1,557 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +roots_l67: +.word -1744507 +.word 2236726 +.word 1922253 +.word 3818627 +.word -447030292 +.word 573161516 +.word 492577742 +.word 978523985 +.word 731434 +.word 781875 +.word 3773731 +.word -3531229 +.word 187430119 +.word 200355636 +.word 967019376 +.word -904878186 +.word -1054478 +.word -1900052 +.word 3974485 +.word 303005 +.word -270210213 +.word -486888731 +.word 1018462631 +.word 77645096 +.word 2354215 +.word -1011223 +.word 327848 +.word -348812 +.word 603268097 +.word -259126110 +.word 84011120 +.word -89383150 +.word 392707 +.word 1716814 +.word 2193087 +.word -3123762 +.word 100631253 +.word 439933955 +.word 561979013 +.word -800464680 +.word -2926054 +.word 3014420 +.word -2358373 +.word 2185084 +.word -749801963 +.word 772445769 +.word -604333585 +.word 559928242 +.word 459163 +.word 653275 +.word -2312838 +.word 3467665 +.word 117660617 +.word 167401858 +.word -592665232 +.word 888589898 +.word 1514152 +.word -3430436 +.word 553718 +.word 1103344 +.word 388001774 +.word -879049958 +.word 141890356 +.word 282732136 +.word -140244 +.word -860144 +.word -508145 +.word -3105558 +.word -35937555 +.word -220412084 +.word -130212265 +.word -795799901 +.word 2778788 +.word -2683270 +.word 2775755 +.word -1356448 +.word 712065019 +.word -687588511 +.word 711287812 +.word -347590090 +.word 770441 +.word -214880 +.word -3020393 +.word 11879 +.word 197425671 +.word -55063046 +.word -773976352 +.word 3043996 +.word -545376 +.word -3363542 +.word 1370517 +.word -3994671 +.word -139752717 +.word -861908357 +.word 351195274 +.word -1023635298 +.word -3374250 +.word -2925816 +.word 1226661 +.word -3901472 +.word -864652284 +.word -749740976 +.word 314332144 +.word -999753034 +.word 
3369273 +.word -2028038 +.word -1723229 +.word -2569011 +.word 863376927 +.word -519685171 +.word -441577800 +.word -658309618 +.word -1163598 +.word -1665318 +.word 1615530 +.word -3980599 +.word -298172236 +.word -426738094 +.word 413979908 +.word -1020029345 +.word -621164 +.word -3035980 +.word -2461387 +.word 1317678 +.word -159173408 +.word -777970524 +.word -630730945 +.word 337655269 +.word 4022750 +.word -4148469 +.word -3009748 +.word 338420 +.word 1030830548 +.word -1063046068 +.word -771248568 +.word 86720197 +.word -749577 +.word 2612853 +.word -2647994 +.word 3033742 +.word -192079267 +.word 669544140 +.word -678549029 +.word 777397036 +.word 2362063 +.word 1300016 +.word 4182915 +.word -3482206 +.word 605279149 +.word 333129378 +.word 1071872863 +.word -892316032 +.word 1834526 +.word 1187885 +.word 1393159 +.word -1994046 +.word 470097680 +.word 304395785 +.word 356997292 +.word -510974714 +.word 724804 +.word -507927 +.word -2491325 +.word 1476985 +.word 185731180 +.word -130156402 +.word -638402564 +.word 378477722 +.word 2254727 +.word 2391089 +.word -1787943 +.word 2579253 +.word 577774276 +.word 612717067 +.word -458160776 +.word 660934133 +.word 2743411 +.word 1179613 +.word 2033807 +.word -2105286 +.word 702999655 +.word 302276083 +.word 521163479 +.word -539479988 +.word -527981 +.word -586241 +.word 2374402 +.word 1623354 +.word -135295244 +.word -150224382 +.word 608441020 +.word 415984810 +.word -3258457 +.word 3250154 +.word -235407 +.word -1736313 +.word -834980303 +.word 832852657 +.word -60323094 +.word -444930577 +.word 2178965 +.word 1879878 +.word 3472069 +.word 1921994 +.word 558360247 +.word 481719139 +.word 889718424 +.word 492511373 +.word 818761 +.word -2039144 +.word -4040196 +.word 458740 +.word 209807681 +.word -522531086 +.word -1035301089 +.word 117552223 +.word 3197248 +.word -1987814 +.word 3488383 +.word 4166425 +.word 819295484 +.word -509377762 +.word 893898890 +.word 1067647297 +.word 2218467 +.word -613238 +.word 
-2513018 +.word -141835 +.word 568482643 +.word -157142369 +.word -643961400 +.word -36345249 +.word 1310261 +.word 1354892 +.word 89301 +.word -2998219 +.word 335754661 +.word 347191365 +.word 22883400 +.word -768294260 +.word 3334383 +.word -2462444 +.word -169688 +.word 565603 +.word 854436357 +.word -631001801 +.word -43482586 +.word 144935890 +.word 12417 +.word -2642980 +.word 3838479 +.word -2296099 +.word 3181859 +.word -677264190 +.word 983611064 +.word -588375860 +.word -1254190 +.word -3195676 +.word -1239911 +.word -3747250 +.word -321386456 +.word -818892658 +.word -317727459 +.word -960233614 +.word 2962264 +.word -1148858 +.word -482649 +.word -1528066 +.word 759080783 +.word -294395108 +.word -123678909 +.word -391567239 +.word 3180456 +.word 3611750 +.word 1727088 +.word 1772588 +.word 814992530 +.word 925511710 +.word 442566669 +.word 454226054 +.word 268456 +.word -2387513 +.word -2192938 +.word 4146264 +.word 68791907 +.word -611800717 +.word -561940831 +.word 1062481036 +.word -4158088 +.word 1109516 +.word 2983781 +.word -2811291 +.word -1065510939 +.word 284313712 +.word 764594519 +.word -720393920 +.word 2455377 +.word -635956 +.word 3768948 +.word 3410568 +.word 629190881 +.word -162963861 +.word 965793731 +.word 873958779 +.word 250446 +.word 3551006 +.word -2678278 +.word 1685153 +.word 64176841 +.word 909946047 +.word -686309310 +.word 431820817 +.word 3815725 +.word -1937570 +.word -2028118 +.word -2508980 +.word 977780347 +.word -496502727 +.word -519705671 +.word -642926661 +.word 3759465 +.word -1596822 +.word 2454145 +.word -822541 +.word 963363710 +.word -409185979 +.word 628875181 +.word -210776307 +.word 3956944 +.word 1979497 +.word -1009365 +.word 27812 +.word 1013967746 +.word 507246529 +.word -258649997 +.word 7126831 +.word 274060 +.word 3121440 +.word 3222807 +.word -4183372 +.word 70227934 +.word 799869667 +.word 825844983 +.word -1071989969 +.word 3716946 +.word 2296397 +.word 3965306 +.word -87208 +.word 952468207 +.word 
588452222 +.word 1016110510 +.word -22347069 +.word 3284915 +.word 3956745 +.word -636927 +.word -1182243 +.word 841760171 +.word 1013916752 +.word -163212680 +.word -302950022 +.word -3852015 +.word 2635473 +.word -1277625 +.word -3073009 +.word -987079667 +.word 675340520 +.word -327391679 +.word -787459213 +.word -2772600 +.word 1780227 +.word 1455890 +.word 1935420 +.word -710479343 +.word 456183549 +.word 373072124 +.word 495951789 +.word 59148 +.word -2660408 +.word 2659525 +.word -1753 +.word 15156688 +.word -681730119 +.word 681503850 +.word -449207 +roots_l345: +.word 1221177 +.word 312926867 +.word -2283733 +.word -585207070 +.word -2815639 +.word -721508096 +.word -1858416 +.word -476219497 +.word -3345963 +.word -857403734 +.word -1853806 +.word -475038184 +.word -2917338 +.word -747568486 +.word 0 +.word 0 +.word -557458 +.word -142848732 +.word 3585098 +.word 918682129 +.word 642628 +.word 164673562 +.word -3870317 +.word -991769559 +.word -556856 +.word -142694469 +.word -3192354 +.word -818041395 +.word 2897314 +.word 742437332 +.word 0 +.word 0 +.word 1005239 +.word 257592709 +.word -1460718 +.word -374309300 +.word -2453983 +.word -628833668 +.word 3950053 +.word 1012201926 +.word 1716988 +.word 439978542 +.word 1935799 +.word 496048908 +.word -3756790 +.word -962678241 +.word 0 +.word 0 +.word -3764867 +.word -964747974 +.word -1714295 +.word -439288460 +.word 3227876 +.word 827143915 +.word 3574466 +.word 915957677 +.word 817536 +.word 209493775 +.word -1759347 +.word -450833045 +.word -3415069 +.word -875112161 +.word 0 +.word 0 +.word -2129892 +.word -545785280 +.word 1335936 +.word 342333886 +.word -676590 +.word -173376332 +.word -2156050 +.word -552488273 +.word -3241972 +.word -830756018 +.word 4018989 +.word 1029866791 +.word -2071829 +.word -530906624 +.word 0 +.word 0 +.word -2682288 +.word -687336873 +.word 434125 +.word 111244624 +.word 3524442 +.word 903139016 +.word 3506380 +.word 898510625 +.word -1095468 +.word -280713909 +.word 
-928749 +.word -237992130 +.word -394148 +.word -101000509 +.word 0 +.word 0 +.word -3542485 +.word -907762539 +.word 1674615 +.word 429120452 +.word -2663378 +.word -682491182 +.word -1159875 +.word -297218217 +.word -3704823 +.word -949361686 +.word -2101410 +.word -538486762 +.word 3110818 +.word 797147778 +.word 0 +.word 0 +.word 601683 +.word 154181397 +.word 4063053 +.word 1041158200 +.word 3370349 +.word 863652652 +.word 3586446 +.word 919027554 +.word -2740543 +.word -702264730 +.word -3182878 +.word -815613168 +.word -3602218 +.word -923069133 +.word 0 +.word 0 +roots_l012: +.word -294725 +.word -75523344 +.word -3761513 +.word -963888510 +.word -3765607 +.word -964937599 +.word 3201430 +.word 820367122 +.word 3145678 +.word 806080660 +.word 2883726 +.word 738955404 +.word 3201494 +.word 820383522 +.word 0 +.word 0 \ No newline at end of file diff --git a/examples/naive/aarch64/intt_kyber_123_4567.s b/examples/naive/aarch64/intt_kyber_123_4567.s new file mode 100644 index 00000000..9cb0b6c4 --- /dev/null +++ b/examples/naive/aarch64/intt_kyber_123_4567.s @@ -0,0 +1,461 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 + vmulq \dst, \src, \const, \idx0 + vmlsq \dst, t2, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vmlsq \dst, t2, consts, 0 +.endm + +.macro gs_butterfly a, b, root, 
idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + 
ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +// For comparability reasons, the output range for the coefficients of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. 
+ +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567 + .global _intt_kyber_123_4567 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567: +_intt_kyber_123_4567: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + 
root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 +layer4567_start: + ldr_vo data0, inp, (16*0) + ldr_vo data1, inp, (16*1) + ldr_vo data2, inp, (16*2) + ldr_vo data3, inp, (16*3) + + transpose4 data // manual ld4 + + load_next_roots_67 + + // Layer 7 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 6 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data + + load_next_roots_45 + + // Layer 5 + gs_butterfly data0, data1, root0, 2, 3 + gs_butterfly data2, data3, root0, 4, 5 + + barrett_reduce data0 + barrett_reduce data2 + barrett_reduce data1 + barrett_reduce data3 + + // Layer 4 + gs_butterfly data0, data2, root0, 0, 1 + gs_butterfly data1, data3, root0, 0, 1 + + str_vi data0, inp, (64) + str_vo data1, inp, (-64 + 16*1) + str_vo data2, inp, (-64 + 16*2) + str_vo data3, inp, (-64 + 16*3) + + subs count, count, #1 + cbnz count, layer4567_start + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + + gs_butterfly data0, data1, root0, 6, 7 
+ gs_butterfly data2, data3, root1, 0, 1 + gs_butterfly data4, data5, root1, 2, 3 + gs_butterfly data6, data7, root1, 4, 5 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root0, 4, 5 + gs_butterfly data5, data7, root0, 4, 5 + + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + str_vo data4, in, (4*(512/8)) + str_vo data5, in, (5*(512/8)) + str_vo data6, in, (6*(512/8)) + str_vo data7, in, (7*(512/8)) + + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret diff --git a/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s new file mode 100644 index 00000000..2bcc941d --- /dev/null +++ b/examples/naive/aarch64/intt_kyber_123_4567_manual_ld4.s @@ -0,0 +1,456 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission 
notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. +// +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+ +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 + vmulq \dst, \src, \const, \idx0 + vmlsq \dst, t2, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vmlsq \dst, t2, consts, 0 +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) 
+ ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +// For comparability reasons, the output range for the coefficients 
of this +// invNTT code is supposed to match the implementation from PQClean on commit +// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients +// are NOT canonically reduced. The ordering of the coefficients is canonical, +// also matching PQClean. + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_manual_ld4 + .global _intt_kyber_123_4567_manual_ld4 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_manual_ld4: +_intt_kyber_123_4567_manual_ld4: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req 
x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 +layer4567_start: + ld4 {data0.4S, data1.4S, data2.4S, data3.4S}, [inp] + + load_next_roots_67 + + // Layer 7 + gs_butterfly_v data0, data1, root1, root1_tw + gs_butterfly_v data2, data3, root2, root2_tw + // Layer 6 + gs_butterfly_v data0, data2, root0, root0_tw + gs_butterfly_v data1, data3, root0, root0_tw + + transpose4 data + + load_next_roots_45 + + // Layer 5 + gs_butterfly data0, data1, root0, 2, 3 + gs_butterfly data2, data3, root0, 4, 5 + + barrett_reduce data0 + barrett_reduce data1 + barrett_reduce data2 + barrett_reduce data3 + + // Layer 4 + gs_butterfly data0, data2, root0, 0, 1 + gs_butterfly data1, data3, root0, 0, 1 + + str_vi data0, inp, (64) + str_vo data1, inp, (-64 + 16*1) + str_vo data2, inp, (-64 + 16*2) + str_vo data3, inp, (-64 + 16*3) + + subs count, count, #1 + cbnz count, layer4567_start + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + +layer123_start: + + ldr_vo data0, in, 0 + ldr_vo data1, in, (1*(512/8)) + ldr_vo 
data2, in, (2*(512/8)) + ldr_vo data3, in, (3*(512/8)) + ldr_vo data4, in, (4*(512/8)) + ldr_vo data5, in, (5*(512/8)) + ldr_vo data6, in, (6*(512/8)) + ldr_vo data7, in, (7*(512/8)) + + gs_butterfly data0, data1, root0, 6, 7 + gs_butterfly data2, data3, root1, 0, 1 + gs_butterfly data4, data5, root1, 2, 3 + gs_butterfly data6, data7, root1, 4, 5 + + gs_butterfly data0, data2, root0, 2, 3 + gs_butterfly data1, data3, root0, 2, 3 + gs_butterfly data4, data6, root0, 4, 5 + gs_butterfly data5, data7, root0, 4, 5 + + gs_butterfly data0, data4, root0, 0, 1 + gs_butterfly data1, data5, root0, 0, 1 + gs_butterfly data2, data6, root0, 0, 1 + gs_butterfly data3, data7, root0, 0, 1 + + str_vo data4, in, (4*(512/8)) + str_vo data5, in, (5*(512/8)) + str_vo data6, in, (6*(512/8)) + str_vo data7, in, (7*(512/8)) + + // Scale half the coeffs by 1/n; for the other half, the scaling has + // been merged into the multiplication with the twiddle factor on the + // last layer. + mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 + + str_vi data0, in, (16) + str_vo data1, in, (-16 + 1*(512/8)) + str_vo data2, in, (-16 + 2*(512/8)) + str_vo data3, in, (-16 + 3*(512/8)) + + + subs count, count, #1 + cbnz count, layer123_start + + pop_stack + ret diff --git a/examples/naive/aarch64/intt_kyber_123_45_67_twiddles.s b/examples/naive/aarch64/intt_kyber_123_45_67_twiddles.s new file mode 100644 index 00000000..1e50a61d --- /dev/null +++ b/examples/naive/aarch64/intt_kyber_123_45_67_twiddles.s @@ -0,0 +1,494 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +roots_l56: +.short -910 +.short -910 +.short -1227 +.short -1227 +.short 219 +.short 219 +.short 855 +.short 855 +.short -8957 +.short -8957 +.short -12078 +.short -12078 +.short 2156 +.short 2156 +.short 8416 +.short 8416 +.short 1175 +.short 1175 +.short 394 +.short 394 +.short -1029 +.short -1029 +.short -1212 +.short -1212 +.short 11566 +.short 11566 +.short 3878 +.short 3878 +.short -10129 +.short -10129 +.short -11930 +.short -11930 +.short -885 +.short -885 +.short 1219 +.short 1219 +.short 1455 +.short 1455 +.short 1607 +.short 1607 +.short -8711 +.short -8711 +.short 11999 +.short 11999 +.short 14322 +.short 14322 +.short 15818 +.short 15818 +.short -648 +.short -648 +.short -1481 +.short -1481 +.short 712 +.short 712 +.short 682 +.short 682 +.short -6378 +.short -6378 +.short -14578 +.short -14578 +.short 7008 +.short 7008 +.short 6713 +.short 6713 +.short -886 +.short -886 +.short 1179 +.short 1179 +.short -1026 +.short -1026 +.short -1092 +.short -1092 +.short -8721 +.short -8721 +.short 11605 +.short 11605 +.short -10099 +.short -10099 +.short -10749 +.short -10749 +.short 554 +.short 554 +.short -1143 +.short -1143 +.short -403 +.short -403 
+.short 525 +.short 525 +.short 5453 +.short 5453 +.short -11251 +.short -11251 +.short -3967 +.short -3967 +.short 5168 +.short 5168 +.short 927 +.short 927 +.short -1534 +.short -1534 +.short 461 +.short 461 +.short -1438 +.short -1438 +.short 9125 +.short 9125 +.short -15099 +.short -15099 +.short 4538 +.short 4538 +.short -14155 +.short -14155 +.short 735 +.short 735 +.short -561 +.short -561 +.short -757 +.short -757 +.short -319 +.short -319 +.short 7235 +.short 7235 +.short -5522 +.short -5522 +.short -7451 +.short -7451 +.short -3140 +.short -3140 +.short 863 +.short 863 +.short 1230 +.short 1230 +.short 556 +.short 556 +.short -1063 +.short -1063 +.short 8495 +.short 8495 +.short 12107 +.short 12107 +.short 5473 +.short 5473 +.short -10463 +.short -10463 +.short -452 +.short -452 +.short -807 +.short -807 +.short -1435 +.short -1435 +.short 1010 +.short 1010 +.short -4449 +.short -4449 +.short -7943 +.short -7943 +.short -14125 +.short -14125 +.short 9942 +.short 9942 +.short -1645 +.short -1645 +.short 780 +.short 780 +.short 109 +.short 109 +.short 1031 +.short 1031 +.short -16192 +.short -16192 +.short 7678 +.short 7678 +.short 1073 +.short 1073 +.short 10148 +.short 10148 +.short 1239 +.short 1239 +.short -375 +.short -375 +.short 1292 +.short 1292 +.short -1584 +.short -1584 +.short 12196 +.short 12196 +.short -3691 +.short -3691 +.short 12717 +.short 12717 +.short -15592 +.short -15592 +.short 1414 +.short 1414 +.short -1320 +.short -1320 +.short -33 +.short -33 +.short 464 +.short 464 +.short 13918 +.short 13918 +.short -12993 +.short -12993 +.short -325 +.short -325 +.short 4567 +.short 4567 +.short -641 +.short -641 +.short 992 +.short 992 +.short 941 +.short 941 +.short 1021 +.short 1021 +.short -6309 +.short -6309 +.short 9764 +.short 9764 +.short 9262 +.short 9262 +.short 10050 +.short 10050 +.short -268 +.short -268 +.short -733 +.short -733 +.short 892 +.short 892 +.short -939 +.short -939 +.short -2638 +.short -2638 +.short -7215 +.short 
-7215 +.short 8780 +.short 8780 +.short -9243 +.short -9243 +.short -632 +.short -632 +.short 816 +.short 816 +.short 1352 +.short 1352 +.short -650 +.short -650 +.short -6221 +.short -6221 +.short 8032 +.short 8032 +.short 13308 +.short 13308 +.short -6398 +.short -6398 +.short 642 +.short 642 +.short -952 +.short -952 +.short 1540 +.short 1540 +.short -1651 +.short -1651 +.short 6319 +.short 6319 +.short -9371 +.short -9371 +.short 15159 +.short 15159 +.short -16251 +.short -16251 +.short -1461 +.short -1461 +.short 1482 +.short 1482 +.short 540 +.short 540 +.short 1626 +.short 1626 +.short -14381 +.short -14381 +.short 14588 +.short 14588 +.short 5315 +.short 5315 +.short 16005 +.short 16005 +.short 1274 +.short 1274 +.short 1052 +.short 1052 +.short 1025 +.short 1025 +.short -1197 +.short -1197 +.short 12540 +.short 12540 +.short 10355 +.short 10355 +.short 10089 +.short 10089 +.short -11782 +.short -11782 +.short 279 +.short 279 +.short 1173 +.short 1173 +.short -233 +.short -233 +.short 667 +.short 667 +.short 2746 +.short 2746 +.short 11546 +.short 11546 +.short -2293 +.short -2293 +.short 6565 +.short 6565 +.short 314 +.short 314 +.short -756 +.short -756 +.short 48 +.short 48 +.short -1409 +.short -1409 +.short 3091 +.short 3091 +.short -7441 +.short -7441 +.short 472 +.short 472 +.short -13869 +.short -13869 +.short 1573 +.short 1573 +.short 76 +.short 76 +.short -331 +.short -331 +.short -289 +.short -289 +.short 15483 +.short 15483 +.short 748 +.short 748 +.short -3258 +.short -3258 +.short -2845 +.short -2845 +.short -1100 +.short -1100 +.short -723 +.short -723 +.short 680 +.short 680 +.short 568 +.short 568 +.short -10828 +.short -10828 +.short -7117 +.short -7117 +.short 6693 +.short 6693 +.short 5591 +.short 5591 +.short 1041 +.short 1041 +.short -1637 +.short -1637 +.short -583 +.short -583 +.short -17 +.short -17 +.short 10247 +.short 10247 +.short -16113 +.short -16113 +.short -5739 +.short -5739 +.short -167 +.short -167 +roots_l34: +.short 
1583 +.short 15582 +.short -821 +.short -8081 +.short 1355 +.short 13338 +.short 0 +.short 0 +.short -569 +.short -5601 +.short 450 +.short 4429 +.short 936 +.short 9213 +.short 0 +.short 0 +.short 69 +.short 679 +.short 447 +.short 4400 +.short -535 +.short -5266 +.short 0 +.short 0 +.short 543 +.short 5345 +.short 1235 +.short 12156 +.short -1426 +.short -14036 +.short 0 +.short 0 +.short -797 +.short -7845 +.short -1333 +.short -13121 +.short 1089 +.short 10719 +.short 0 +.short 0 +.short -193 +.short -1900 +.short -56 +.short -551 +.short 283 +.short 2786 +.short 0 +.short 0 +.short 1410 +.short 13879 +.short -1476 +.short -14529 +.short -1339 +.short -13180 +.short 0 +.short 0 +.short -1062 +.short -10453 +.short 882 +.short 8682 +.short -296 +.short -2914 +.short 0 +.short 0 +roots_l012: +// layer 0 root modified to include ninv +.short 266 // originally: 1600 +.short 2618 // originally: 15749 +.short 40 +.short 394 +.short 749 +.short 7373 +.short -848 +.short -8347 +.short 1432 +.short 14095 +.short -630 +.short -6201 +.short 687 +.short 6762 +.short 0 +.short 0 \ No newline at end of file diff --git a/examples/naive/aarch64/ntt_dilithium_1234_5678.s b/examples/naive/aarch64/ntt_dilithium_1234_5678.s index 2709e634..c607f61a 100644 --- a/examples/naive/aarch64/ntt_dilithium_1234_5678.s +++ b/examples/naive/aarch64/ntt_dilithium_1234_5678.s @@ -68,15 +68,15 @@ .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -85,12 +85,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, 
\const\().4s - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, modulus -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git a/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s b/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s index f793d2b4..2f5d42a8 100644 --- a/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s +++ b/examples/naive/aarch64/ntt_dilithium_1234_5678_manual_st4.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmla \dst, \src, modulus + vmla \dst, t2, modulus .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 + vmla \dst, t2, modulus .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, modulus, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678.s b/examples/naive/aarch64/ntt_dilithium_123_45678.s index d0a2b946..b460c4cf 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678.s @@ -47,15 +47,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, 
\const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +64,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s b/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s index 0b31709a..75c1274b 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_manual_st4.s @@ -47,15 +47,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -64,12 +64,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_red.s b/examples/naive/aarch64/ntt_dilithium_123_45678_red.s index 316b6f93..e8c63491 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_red.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_red.s @@ -48,15 +48,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + 
vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -65,12 +65,6 @@ add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s index 39249343..209d6473 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar.s @@ -57,15 +57,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -74,12 +74,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git 
a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s index 706cdf30..519251cc 100644 --- a/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s +++ b/examples/naive/aarch64/ntt_dilithium_123_45678_w_scalar_red.s @@ -53,15 +53,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -70,12 +70,6 @@ xtmp1 .req x11 add \a\().4s, \a\().4s, tmp.4s .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().4s, \src\().4s, \const\().4s - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().4s, \a\().4s, tmp.4s diff --git a/examples/naive/aarch64/ntt_kyber_1234_567.s b/examples/naive/aarch64/ntt_kyber_1234_567.s index 9b43d6fe..7c211a3b 100644 --- a/examples/naive/aarch64/ntt_kyber_1234_567.s +++ b/examples/naive/aarch64/ntt_kyber_1234_567.s @@ -73,15 +73,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlaq \dst, \src, consts, 0 + vmlaq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, consts, 0 + vmlaq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -90,12 +90,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, 
const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, consts -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h diff --git a/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s b/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s index 8da669ae..c0dc85fd 100644 --- a/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s +++ b/examples/naive/aarch64/ntt_kyber_1234_567_manual_st4.s @@ -74,15 +74,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlaq \dst, \src, consts, 0 + vmlaq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlaq \dst, \src, consts, 0 + vmlaq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -91,12 +91,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmla \dst, \src, consts -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h diff --git a/examples/naive/aarch64/ntt_kyber_123_4567.s b/examples/naive/aarch64/ntt_kyber_123_4567.s index 778841da..b6bc21a7 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 
0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s b/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s index 694d06a9..c699d9ab 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_manual_st4.s @@ -67,15 +67,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -84,12 +84,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s index 781b0494..face162b 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load.s @@ -79,15 +79,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, 
\src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -96,12 +96,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s index f55806cd..90beaa50 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_load_store.s @@ -83,15 +83,15 @@ xtmp1 .req x11 .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -100,12 +100,6 @@ xtmp1 .req x11 add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h diff --git a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s 
b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s index e7cdb089..523fe947 100644 --- a/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s +++ b/examples/naive/aarch64/ntt_kyber_123_4567_scalar_store.s @@ -66,15 +66,15 @@ .endm .macro mulmodq dst, src, const, idx0, idx1 + vqrdmulhq t2, \src, \const, \idx1 vmulq \dst, \src, \const, \idx0 - vqrdmulhq \src, \src, \const, \idx1 - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro mulmod dst, src, const, const_twisted + vqrdmulh t2, \src, \const_twisted mul \dst\().8h, \src\().8h, \const\().8h - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 + vmlsq \dst, t2, consts, 0 .endm .macro ct_butterfly a, b, root, idx0, idx1 @@ -83,12 +83,6 @@ add \a\().8h, \a\().8h, tmp.8h .endm -.macro mulmod_v dst, src, const, const_twisted - vmul \dst, \src, \const - vqrdmulh \src, \src, \const_twisted - vmlsq \dst, \src, consts, 0 -.endm - .macro ct_butterfly_v a, b, root, root_twisted mulmod tmp, \b, \root, \root_twisted sub \b\().8h, \a\().8h, tmp.8h diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a55.s b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a55.s new file mode 100644 index 00000000..4b948c7c --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a55.s @@ -0,0 +1,1746 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// 
The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+///
+/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+#include
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy.
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre- and post-increment for loads and stores.
+
+// Eventually, Slothy should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset // load the 128-bit register aliased as qform_ + vec from [base + offset]
+    ldr qform_\vec, [\base, #\offset]
+.endm
+.macro ldr_vi vec, base, inc // load qform_ + vec from [base], then post-increment base by inc
+    ldr qform_\vec, [\base], #\inc
+.endm
+.macro str_vo vec, base, offset // store qform_ + vec to [base + offset]
+    str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc // store qform_ + vec to [base], then post-increment base by inc
+    str qform_\vec, [\base], #\inc
+.endm
+.macro vsub d,a,b // d = a - b, lane-wise on four 32-bit lanes
+    sub \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vadd d,a,b // d = a + b, lane-wise on four 32-bit lanes
+    add \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulh d,a,b // d = sqrdmulh(a, b): signed saturating rounding doubling multiply, high half
+    sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmul d,a,b // d = low 32 bits of a * b, lane-wise
+    mul \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmls d,a,b // d = d - a * b, lane-wise (multiply-subtract into d)
+    mls \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulhq d,a,b,i // as vqrdmulh, but every lane of a is multiplied by the single lane b.s[i]
+    sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmulq d,a,b,i // as vmul, but every lane of a is multiplied by the single lane b.s[i]
+    mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i // as vmls, but every lane of a is multiplied by the single lane b.s[i]
+    mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro trn1_d d,a,b // trn1 on 64-bit elements: d = { a.d[0], b.d[0] }
+    trn1 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn2_d d,a,b // trn2 on 64-bit elements: d = { a.d[1], b.d[1] }
+    trn2 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn1_s d,a,b // trn1 on 32-bit elements: d = { a[0], b[0], a[2], b[2] }
+    trn1 \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro trn2_s d,a,b // trn2 on 32-bit elements: d = { a[1], b[1], a[3], b[3] }
+    trn2 \d\().4s, \a\().4s, \b\().4s
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1 // dst = src * const.s[idx0] - sqrdmulh(src, const.s[idx1]) * modulus
+    vmulq \dst, \src, \const, \idx0
+    vqrdmulhq \src, \src, \const, \idx1 // NOTE: overwrites src. NOTE(review): the naive ntt_* variants of this macro write to t2 instead - confirm whether this file should match
+    vmls \dst, \src, modulus
+.endm
+
+.macro mulmod dst, src, const, const_twisted // dst = src * const - sqrdmulh(src, const_twisted) * modulus, with full-vector constants
+    vmul \dst, \src, \const
+    vqrdmulh \src, \src, \const_twisted // NOTE: overwrites src
+    vmls \dst, \src, modulus
+.endm
+
+.macro barrett_reduce_single a // a = a - round(a / 2^23) * modulus; uses tmp as scratch
+    srshr tmp.4S, \a\().4S, #23 // rounding arithmetic shift right by 23
+    vmls \a, tmp, modulus
+.endm
+
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 // add or subtract modulus once, per lane, depending on two threshold compares
+    cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 = all-ones (-1) where a <= neg_modulus_half, else 0
+    cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 = all-ones (-1) where a >= modulus_half, else 0
+    sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 = -1 (a too small), +1 (a too large) or 0
+    vmls \a, \tmp2, modulus // a = a - tmp2 * modulus
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1 // a = a + b; b = mulmodq(a - b, root lanes idx0/idx1); uses tmp as scratch
+    vsub tmp, \a, \b
+    vadd \a, \a, \b
+    mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted // same body as mulmod. NOTE(review): not referenced by the macros visible here - candidate for removal
+    vmul \dst, \src, \const
+    vqrdmulh \src, \src, \const_twisted // NOTE: overwrites src
+    vmls \dst, \src, modulus
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted // a = a + b; b = mulmod(a - b, root, root_twisted); uses tmp as scratch
+    vsub tmp, \a, \b
+    vadd \a, \a, \b
+    mulmod \b, tmp, \root, \root_twisted
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 // dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..7; each srcN is overwritten by mulmod
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+    mulmod \dst4, \src4, ninv, ninv_tw
+    mulmod \dst5, \src5, ninv, ninv_tw
+    mulmod \dst6, \src6, ninv, ninv_tw
+    mulmod \dst7, \src7, ninv, ninv_tw
+.endm
+
+.macro load_roots_1234 r_ptr // load eight consecutive 16-byte vectors into root0..root7 and advance r_ptr by 8*16 bytes
+    ldr_vi root0, \r_ptr, (8*16) // post-increment first, so the remaining loads use negative offsets
+    ldr_vo root1, \r_ptr, (-8*16 + 1*16)
+    ldr_vo root2, \r_ptr, (-8*16 + 2*16)
+    ldr_vo root3, \r_ptr, (-8*16 + 3*16)
+    ldr_vo root4, \r_ptr, (-8*16 + 4*16)
+    ldr_vo root5, \r_ptr, (-8*16 + 5*16)
+    ldr_vo root6, \r_ptr, (-8*16 + 6*16)
+    ldr_vo root7, \r_ptr, (-8*16 + 7*16)
+.endm
+
+.macro load_next_roots_56 root0, r_ptr0 // load one 16-byte vector and advance r_ptr0 by 16 bytes
+    ldr_vi \root0, \r_ptr0, 16
+.endm
+
+.macro load_next_roots_6 root0, r_ptr0 // load one 16-byte vector but advance r_ptr0 by only 8 bytes
+    ldr_vi \root0, \r_ptr0, 8
+.endm
+
+.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 // load six consecutive 16-byte vectors and advance r_ptr1 by 6*16 bytes
+    ldr_vi \root0, \r_ptr1, (6*16) // post-increment first, so the remaining loads use negative offsets
+    ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16)
+    ldr_vo \root1, \r_ptr1, (-6*16 + 2*16)
+    ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16)
+    ldr_vo \root2, \r_ptr1, (-6*16 + 4*16)
+    ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16)
+.endm
+
+.macro transpose4 data // in-place 4x4 transpose of 32-bit elements across the registers named data + 0..3; uses t0..t3 as scratch
+    trn1_s t0, \data\()0, \data\()1 // interleave 32-bit elements of row pairs
+    trn2_s t1, \data\()0, \data\()1
+    trn1_s t2, \data\()2, \data\()3
+    trn2_s t3, \data\()2, \data\()3
+
+    trn2_d \data\()2, t0, t2 // combine 64-bit halves to finish the transpose
+    trn2_d \data\()3, t1, t3
+    trn1_d \data\()0, t0, t2
+    trn1_d \data\()1, t1, t3
+.endm
+
+.macro save_gprs // slothy:no-unfold
+    sub sp, sp, #(16*6) // reserve 96 bytes, then spill x19..x29 (callee-saved under AAPCS64)
+    stp x19, x20, [sp, #16*0]
+    stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store (same registers, same slot) - redundant
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+    ldp x19, x20, [sp, #16*0] // reload x19..x29 from the slots written by save_gprs
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldr x29, [sp, #16*5]
+    add sp, sp, #(16*6) // release the 96 bytes reserved by save_gprs
+.endm
+
+.macro save_vregs // slothy:no-unfold
+    sub sp, sp, #(16*4) // reserve 64 bytes, then spill d8..d15 (low halves of v8..v15, callee-saved under AAPCS64)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+    ldp d8, d9, [sp, #16*0] // reload d8..d15 from the slots written by save_vregs
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+    ldr \a, [sp, #\loc\()] // reload register a from stack offset loc
+.endm
+.macro save loc, a // slothy:no-unfold
+    str \a, [sp, #\loc\()] // spill register a to stack offset loc
+.endm
+.macro push_stack // slothy:no-unfold
+    save_gprs
+    save_vregs
+    sub sp, sp, #STACK_SIZE // extra scratch area addressed through save/restore
+.endm
+
+.macro pop_stack // slothy:no-unfold
+    add sp, sp, #STACK_SIZE // undo push_stack in reverse order
+    restore_vregs
+    restore_gprs
+.endm
+
+.data
+.p2align 4
+roots: // twiddle table; its contents come from the included file
+#include "intt_dilithium_1234_5678_twiddles.s"
+.text
+
+    .global intt_dilithium_1234_5678_manual_ld4_opt_a55
+    .global _intt_dilithium_1234_5678_manual_ld4_opt_a55
+
+.p2align 4
+modulus_addr: .quad 8380417 // 2^23 - 2^13 + 1; broadcast into the modulus register by the entry code
+ninv_addr: .quad 16382 // presumably the constants behind ninv / ninv_tw used by mul_ninv - TODO confirm, not referenced in the visible code
+ninv_tw_addr: .quad 4197891
+intt_dilithium_1234_5678_manual_ld4_opt_a55:
+_intt_dilithium_1234_5678_manual_ld4_opt_a55:
+    push_stack
+
+    inp .req x0
+    in .req x1
+    count .req x2
+    r_ptr0 .req x3
+    r_ptr1 .req x4
+    xtmp .req x5
+
+    data0 .req v8
+    data1 .req v9
+    data2 .req v10
+    data3 .req v11
+    data4 .req v12
+    data5 .req v13
+    data6 .req v14
+    data7 .req v15
+    data8 .req v16
+    data9 .req v17
+    data10 .req v18
+    data11 .req v19
+    data12 .req v20
+    data13 .req v21
+    data14 .req v22
+    data15 .req v23
+
+    qform_data0 .req q8
+    qform_data1 .req q9
+    qform_data2 .req q10
+    qform_data3 .req q11
+    qform_data4 .req q12
+    qform_data5 .req q13
+    qform_data6 .req q14
+    qform_data7 .req q15
+    qform_data8 .req q16
+    qform_data9 .req q17
+    qform_data10 .req q18
+    qform_data11 .req q19
+    qform_data12 .req q20 +
qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ldr q13, [x3, #80] // ..........* + // gap // ........... + // gap // ........... + // gap // ........... + ldr q6, [x0, #16] // .*......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q24, [x0, #0] // *.......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q14, [x0, #48] // ...*....... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q28, [x3, #32] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q1, [x3, #48] // ........*.. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q2, [x0, #32] // ..*........ + // gap // ........... + // gap // ........... + // gap // ........... + ldr q31, [x3], #(6*16) // ....*...... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q26, [x3, #-80] // ......*.... + // gap // ........... + // gap // ........... + // gap // ........... + trn1 v3.4S, v2.4S, v14.4S // .....*..... + // gap // ........... + ldr q15, [x3, #-32] // .........*. + // gap // ........... + + // original source code + // ldr q24, [x0, #0] // ..*........ + // ldr q6, [x0, #16] // .*......... + // ldr q2, [x0, #32] // ......*.... + // ldr q14, [x0, #48] // ...*....... 
+ // ldr q31, [x3], #(6*16) // .......*... + // trn1 v3.4S, v2.4S, v14.4S // .........*. + // ldr q26, [x3, #-80] // ........*.. + // ldr q28, [x3, #-64] // ....*...... + // ldr q1, [x3, #-48] // .....*..... + // ldr q15, [x3, #-32] // ..........* + // ldr q13, [x3, #-16] // *.......... + + sub count, count, #1 +layer5678_start: + trn1 v17.4S, v24.4S, v6.4S // ....*....................................................................... + // gap // ............................................................................ + trn2 v21.4S, v24.4S, v6.4S // .....*...................................................................... + // gap // ............................................................................ + trn2 v14.4S, v2.4S, v14.4S // .......*.................................................................... + // gap // ............................................................................ + trn2 v2.2D, v17.2D, v3.2D // ........*................................................................... + // gap // ............................................................................ + trn1 v17.2D, v17.2D, v3.2D // ..........*................................................................. + // gap // ............................................................................ + trn2 v3.2D, v21.2D, v14.2D // .........*.................................................................. + // gap // ............................................................................ + trn1 v21.2D, v21.2D, v14.2D // ...........*................................................................ + // gap // ............................................................................ + sub v14.4S, v2.4S, v3.4S // .......................*.................................................... + // gap // ............................................................................ 
+ add v2.4S, v2.4S, v3.4S // ........................*................................................... + // gap // ............................................................................ + sub v3.4S, v17.4S, v21.4S // ..................*......................................................... + // gap // ............................................................................ + add v17.4S, v17.4S, v21.4S // ...................*........................................................ + // gap // ............................................................................ + mul v21.4S, v14.4S, v15.4S // .........................*.................................................. + // gap // ............................................................................ + mul v24.4S, v3.4S, v28.4S // ....................*....................................................... + // gap // ............................................................................ + sqrdmulh v3.4S, v3.4S, v1.4S // .....................*...................................................... + // gap // ............................................................................ + sqrdmulh v14.4S, v14.4S, v13.4S // ..........................*................................................. + // gap // ............................................................................ + sub v28.4S, v17.4S, v2.4S // ............................*............................................... + // gap // ............................................................................ + add v17.4S, v17.4S, v2.4S // .............................*.............................................. + // gap // ............................................................................ + mls v24.4S, v3.4S, v29.4S // ......................*..................................................... + // gap // ............................................................................ 
+ mls v21.4S, v14.4S, v29.4S // ...........................*................................................ + // gap // ............................................................................ + mul v14.4S, v28.4S, v31.4S // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v2.4S, v28.4S, v26.4S // ...............................*............................................ + // gap // ............................................................................ + ldr q3, [x4], #8 // ..............................................*............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v28.4S, v24.4S, v21.4S // .................................*.......................................... + // gap // ............................................................................ + mls v14.4S, v2.4S, v29.4S // ................................*........................................... + // gap // ............................................................................ + add v21.4S, v24.4S, v21.4S // ..................................*......................................... + // gap // ............................................................................ + mul v2.4S, v28.4S, v31.4S // ...................................*........................................ + // gap // ............................................................................ + sqrdmulh v24.4S, v28.4S, v26.4S // ....................................*....................................... + // gap // ............................................................................ 
+ trn1 v28.4S, v17.4S, v21.4S // ......................................*..................................... + // gap // ............................................................................ + trn2 v17.4S, v17.4S, v21.4S // .......................................*.................................... + // gap // ............................................................................ + ldr q21, [x4], #16 // ...............................................*............................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v2.4S, v24.4S, v29.4S // .....................................*...................................... + // gap // ............................................................................ + ldr q24, [x0, #64] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q6, [x0, #80] // .e.......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v31.4S, v14.4S, v2.4S // ........................................*................................... + // gap // ............................................................................ 
+ trn2 v14.4S, v14.4S, v2.4S // .........................................*.................................. + // gap // ............................................................................ + ldr q2, [x0, #96] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v26.2D, v28.2D, v31.2D // ..........................................*................................. + // gap // ............................................................................ + trn2 v1.2D, v17.2D, v14.2D // ...........................................*................................ + // gap // ............................................................................ + trn1 v28.2D, v28.2D, v31.2D // ............................................*............................... + // gap // ............................................................................ + trn1 v17.2D, v17.2D, v14.2D // .............................................*.............................. + // gap // ............................................................................ + sub v14.4S, v26.4S, v1.4S // .....................................................*...................... + // gap // ............................................................................ + add v31.4S, v28.4S, v17.4S // .................................................*.......................... + // gap // ............................................................................ + sub v17.4S, v28.4S, v17.4S // ................................................*........................... + // gap // ............................................................................ 
+ add v28.4S, v26.4S, v1.4S // ......................................................*..................... + // gap // ............................................................................ + mul v26.4S, v14.4S, v21.S[2] // .......................................................*.................... + // gap // ............................................................................ + mul v1.4S, v17.4S, v21.S[0] // ..................................................*......................... + // gap // ............................................................................ + sqrdmulh v17.4S, v17.4S, v21.S[1] // ...................................................*........................ + // gap // ............................................................................ + sqrdmulh v21.4S, v14.4S, v21.S[3] // ........................................................*................... + // gap // ............................................................................ + sub v14.4S, v31.4S, v28.4S // ..........................................................*................. + // gap // ............................................................................ + add v28.4S, v31.4S, v28.4S // ...........................................................*................ + // gap // ............................................................................ + mls v1.4S, v17.4S, v29.4S // ....................................................*....................... + // gap // ............................................................................ + mls v26.4S, v21.4S, v29.4S // .........................................................*.................. + // gap // ............................................................................ + mul v17.4S, v14.4S, v3.S[0] // ............................................................*............... + // gap // ............................................................................ 
+ sqrdmulh v21.4S, v14.4S, v3.S[1] // .............................................................*.............. + // gap // ............................................................................ + srshr v14.4S, v28.4S, #23 // ....................................................................*....... + // gap // ............................................................................ + sub v31.4S, v1.4S, v26.4S // ...............................................................*............ + // gap // ............................................................................ + add v26.4S, v1.4S, v26.4S // ................................................................*........... + // gap // ............................................................................ + mls v17.4S, v21.4S, v29.4S // ..............................................................*............. + // gap // ............................................................................ + mul v21.4S, v31.4S, v3.S[0] // .................................................................*.......... + // gap // ............................................................................ + sqrdmulh v3.4S, v31.4S, v3.S[1] // ..................................................................*......... + // gap // ............................................................................ + mls v28.4S, v14.4S, v29.4S // .....................................................................*...... + // gap // ............................................................................ + srshr v14.4S, v26.4S, #23 // ......................................................................*..... + // gap // ............................................................................ + str q17, [x0, #32] // ..........................................................................*. + // gap // ............................................................................ 
+ mls v21.4S, v3.4S, v29.4S // ...................................................................*........ + // gap // ............................................................................ + mls v26.4S, v14.4S, v29.4S // .......................................................................*.... + // gap // ............................................................................ + str q28, [x0], #(16*4) // ........................................................................*... + // gap // ............................................................................ + ldr q14, [x0, #48] // ...e........................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q26, [x0, #-48] // .........................................................................*.. + // gap // ............................................................................ + ldr q31, [x3], #(6*16) // ............e............................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q21, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + trn1 v3.4S, v2.4S, v14.4S // ......e..................................................................... + // gap // ............................................................................ 
+ ldr q26, [x3, #-80] // .............e.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q28, [x3, #-64] // ..............e............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q1, [x3, #-48] // ...............e............................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q15, [x3, #-32] // ................e........................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q13, [x3, #-16] // .................e.......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ + // original source code + // ldr q8, [x0, #(16*0)] // e............................................|..............................e...................................... + // ldr q9, [x0, #(16*1)] // .e...........................................|...............................e..................................... + // ldr q10, [x0, #(16*2)] // ....e........................................|..................................e.................................. + // ldr q11, [x0, #(16*3)] // ...................................e.........|.................................................................e... + // trn1 v25.4s, v8.4s, v9.4s // .............................................*..................................................................... + // trn2 v26.4s, v8.4s, v9.4s // .............................................|*.................................................................... + // trn1 v27.4s, v10.4s, v11.4s // .......................................e.....|..................................................................... + // trn2 v28.4s, v10.4s, v11.4s // .............................................|.*................................................................... + // trn2 v10.2d, v25.2d, v27.2d // .............................................|..*.................................................................. + // trn2 v11.2d, v26.2d, v28.2d // .............................................|....*................................................................ + // trn1 v8.2d, v25.2d, v27.2d // .............................................|...*................................................................. + // trn1 v9.2d, v26.2d, v28.2d // .............................................|.....*............................................................... + // ldr q0, [x3], #(6*16) // .....................................e.......|...................................................................e. 
+ // ldr q4, [x3, #(-6*16 + 1*16)] // ........................................e....|..................................................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // .........................................e...|..................................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ..........................................e..|..................................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...........................................e.|..................................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ............................................e|..................................................................... + // sub v24.4s, v8.4s, v9.4s // .............................................|........*............................................................ + // add v8.4s, v8.4s, v9.4s // .............................................|.........*........................................................... + // mul v9.4s, v24.4s, v1.4s // .............................................|...........*......................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................|............*........................................................ + // mls v9.4s, v24.4s, v29.4s // .............................................|................*.................................................... + // sub v24.4s, v10.4s, v11.4s // .............................................|......*.............................................................. + // add v10.4s, v10.4s, v11.4s // .............................................|.......*............................................................. + // mul v11.4s, v24.4s, v2.4s // .............................................|..........*.......................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // .............................................|.............*....................................................... + // mls v11.4s, v24.4s, v29.4s // .............................................|.................*................................................... + // sub v24.4s, v8.4s, v10.4s // .............................................|..............*...................................................... + // add v8.4s, v8.4s, v10.4s // .............................................|...............*..................................................... + // mul v10.4s, v24.4s, v0.4s // .............................................|..................*.................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................|...................*................................................. + // mls v10.4s, v24.4s, v29.4s // .............................................|......................*.............................................. + // sub v24.4s, v9.4s, v11.4s // .............................................|.....................*............................................... + // add v9.4s, v9.4s, v11.4s // .............................................|.......................*............................................. + // mul v11.4s, v24.4s, v0.4s // .............................................|........................*............................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................|.........................*........................................... + // mls v11.4s, v24.4s, v29.4s // .............................................|.............................*....................................... + // trn1 v25.4s, v8.4s, v9.4s // .............................................|..........................*.......................................... 
+ // trn2 v26.4s, v8.4s, v9.4s // .............................................|...........................*......................................... + // trn1 v27.4s, v10.4s, v11.4s // ..*..........................................|................................*.................................... + // trn2 v28.4s, v10.4s, v11.4s // ...*.........................................|.................................*................................... + // trn2 v10.2d, v25.2d, v27.2d // .....*.......................................|...................................*................................. + // trn2 v11.2d, v26.2d, v28.2d // ......*......................................|....................................*................................ + // trn1 v8.2d, v25.2d, v27.2d // .......*.....................................|.....................................*............................... + // trn1 v9.2d, v26.2d, v28.2d // ........*....................................|......................................*.............................. + // ldr q1, [x4], #8 // .............................................|....................*................................................ + // ldr q0, [x4], #16 // .............................................|............................*........................................ + // sub v24.4s, v8.4s, v9.4s // ...........*.................................|.........................................*........................... + // add v8.4s, v8.4s, v9.4s // ..........*..................................|........................................*............................ + // mul v9.4s, v24.4s, v0.s[0] // ..............*..............................|............................................*........................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............*.............................|.............................................*....................... 
+ // mls v9.4s, v24.4s, v29.4s // ...................*.........................|.................................................*................... + // sub v24.4s, v10.4s, v11.4s // .........*...................................|.......................................*............................. + // add v10.4s, v10.4s, v11.4s // ............*................................|..........................................*.......................... + // mul v11.4s, v24.4s, v0.s[2] // .............*...............................|...........................................*......................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................*............................|..............................................*...................... + // mls v11.4s, v24.4s, v29.4s // ....................*........................|..................................................*.................. + // sub v24.4s, v8.4s, v10.4s // .................*...........................|...............................................*..................... + // add v8.4s, v8.4s, v10.4s // ..................*..........................|................................................*.................... + // mul v10.4s, v24.4s, v1.s[0] // .....................*.......................|...................................................*................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................*......................|....................................................*................ + // mls v10.4s, v24.4s, v29.4s // ..........................*..................|........................................................*............ + // sub v24.4s, v9.4s, v11.4s // ........................*....................|......................................................*.............. + // add v9.4s, v9.4s, v11.4s // .........................*...................|.......................................................*............. 
+ // mul v11.4s, v24.4s, v1.s[0] // ...........................*.................|.........................................................*........... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................*................|..........................................................*.......... + // mls v11.4s, v24.4s, v29.4s // ................................*............|..............................................................*...... + // srshr v24.4S, v8.4S, #23 // .......................*.....................|.....................................................*............... + // mls v8.4s, v24.4s, v29.4s // .............................*...............|...........................................................*......... + // srshr v24.4S, v9.4S, #23 // ..............................*..............|............................................................*........ + // mls v9.4s, v24.4s, v29.4s // .................................*...........|...............................................................*..... + // str q8, [x0], #(16*4) // ..................................*..........|................................................................*.... + // str q9, [x0, #(-16*4 + 1*16)] // ....................................*........|..................................................................*.. + // str q10, [x0, #(-16*4 + 2*16)] // ...............................*.............|.............................................................*....... + // str q11, [x0, #(-16*4 + 3*16)] // ......................................*......|....................................................................* + + sub count, count, #1 // loop control: one pass of the loop body is done, decrement the remaining-iteration counter + cbnz count, layer5678_start // branch back to layer5678_start while count is non-zero; otherwise fall through to the straight-line code below, which consumes registers written by the 'e'-marked instructions of the loop body (v24, v6, v2, v14, v3, ...) - presumably the software-pipelining postamble that completes the final iteration + trn1 v9.4S, v24.4S, v6.4S // *................................................................ + // gap // ................................................................. + trn2 v17.4S, v2.4S, v14.4S // ..*..............................................................
+ // gap // ................................................................. + trn2 v21.4S, v24.4S, v6.4S // .*............................................................... + // gap // ................................................................. + trn1 v16.2D, v9.2D, v3.2D // ....*............................................................ + // gap // ................................................................. + trn2 v30.2D, v9.2D, v3.2D // ...*............................................................. + // gap // ................................................................. + trn1 v0.2D, v21.2D, v17.2D // ......*.......................................................... + // gap // ................................................................. + trn2 v24.2D, v21.2D, v17.2D // .....*........................................................... + // gap // ................................................................. + sub v6.4S, v16.4S, v0.4S // .........*....................................................... + // gap // ................................................................. + sub v14.4S, v30.4S, v24.4S // .......*......................................................... + // gap // ................................................................. + add v23.4S, v30.4S, v24.4S // ........*........................................................ + // gap // ................................................................. + sqrdmulh v21.4S, v6.4S, v1.4S // .............*................................................... + // gap // ................................................................. + sqrdmulh v17.4S, v14.4S, v13.4S // ..............*.................................................. + // gap // ................................................................. + mul v2.4S, v14.4S, v15.4S // ...........*..................................................... 
+ // gap // ................................................................. + mul v14.4S, v6.4S, v28.4S // ............*.................................................... + // gap // ................................................................. + add v8.4S, v16.4S, v0.4S // ..........*...................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v2.4S, v17.4S, v29.4S // ..................*.............................................. + // gap // ................................................................. + mls v14.4S, v21.4S, v29.4S // .................*............................................... + // gap // ................................................................. + sub v17.4S, v8.4S, v23.4S // ...............*................................................. + // gap // ................................................................. + ldr q4, [x4], #8 // .....................*........................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v28.4S, v14.4S, v2.4S // ......................*.......................................... + // gap // ................................................................. + sqrdmulh v16.4S, v17.4S, v26.4S // ....................*............................................ + // gap // ................................................................. + mul v20.4S, v17.4S, v31.4S // ...................*............................................. + // gap // ................................................................. 
+ sqrdmulh v17.4S, v28.4S, v26.4S // ..........................*...................................... + // gap // ................................................................. + mul v3.4S, v28.4S, v31.4S // .........................*....................................... + // gap // ................................................................. + add v24.4S, v14.4S, v2.4S // ........................*........................................ + // gap // ................................................................. + add v28.4S, v8.4S, v23.4S // ................*................................................ + // gap // ................................................................. + mls v20.4S, v16.4S, v29.4S // .......................*......................................... + // gap // ................................................................. + mls v3.4S, v17.4S, v29.4S // ..............................*.................................. + // gap // ................................................................. + trn2 v2.4S, v28.4S, v24.4S // ............................*.................................... + // gap // ................................................................. + ldr q21, [x4], #16 // .............................*................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v14.4S, v20.4S, v3.4S // ................................*................................ + // gap // ................................................................. + trn1 v17.4S, v20.4S, v3.4S // ...............................*................................. + // gap // ................................................................. 
+ trn1 v3.4S, v28.4S, v24.4S // ...........................*..................................... + // gap // ................................................................. + trn1 v30.2D, v2.2D, v14.2D // ....................................*............................ + // gap // ................................................................. + trn2 v14.2D, v2.2D, v14.2D // ..................................*.............................. + // gap // ................................................................. + trn2 v28.2D, v3.2D, v17.2D // .................................*............................... + // gap // ................................................................. + trn1 v11.2D, v3.2D, v17.2D // ...................................*............................. + // gap // ................................................................. + sub v16.4S, v28.4S, v14.4S // .....................................*........................... + // gap // ................................................................. + add v15.4S, v28.4S, v14.4S // ........................................*........................ + // gap // ................................................................. + add v19.4S, v11.4S, v30.4S // ......................................*.......................... + // gap // ................................................................. + mul v31.4S, v16.4S, v21.S[2] // .........................................*....................... + // gap // ................................................................. + sub v30.4S, v11.4S, v30.4S // .......................................*......................... + // gap // ................................................................. + sqrdmulh v14.4S, v16.4S, v21.S[3] // ............................................*.................... + // gap // ................................................................. 
+ add v28.4S, v19.4S, v15.4S // ..............................................*.................. + // gap // ................................................................. + sqrdmulh v2.4S, v30.4S, v21.S[1] // ...........................................*..................... + // gap // ................................................................. + mul v6.4S, v30.4S, v21.S[0] // ..........................................*...................... + // gap // ................................................................. + srshr v17.4S, v28.4S, #23 // ...................................................*............. + // gap // ................................................................. + sub v21.4S, v19.4S, v15.4S // .............................................*................... + // gap // ................................................................. + mls v31.4S, v14.4S, v29.4S // ................................................*................ + // gap // ................................................................. + mls v6.4S, v2.4S, v29.4S // ...............................................*................. + // gap // ................................................................. + sqrdmulh v3.4S, v21.4S, v4.S[1] // ..................................................*.............. + // gap // ................................................................. + mul v24.4S, v21.4S, v4.S[0] // .................................................*............... + // gap // ................................................................. + mls v28.4S, v17.4S, v29.4S // .........................................................*....... + // gap // ................................................................. + sub v21.4S, v6.4S, v31.4S // ....................................................*............ + // gap // ................................................................. 
+ add v2.4S, v6.4S, v31.4S // .....................................................*........... + // gap // ................................................................. + mls v24.4S, v3.4S, v29.4S // ......................................................*.......... + // gap // ................................................................. + mul v14.4S, v21.4S, v4.S[0] // .......................................................*......... + // gap // ................................................................. + srshr v17.4S, v2.4S, #23 // ..........................................................*...... + // gap // ................................................................. + sqrdmulh v21.4S, v21.4S, v4.S[1] // ........................................................*........ + // gap // ................................................................. + str q24, [x0, #32] // ...........................................................*..... + // gap // ................................................................. + mls v2.4S, v17.4S, v29.4S // .............................................................*... + // gap // ................................................................. + str q28, [x0], #(16*4) // ..............................................................*.. + // gap // ................................................................. + mls v14.4S, v21.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q2, [x0, #-48] // ...............................................................*. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + str q14, [x0, #-16] // ................................................................* + // gap // ................................................................. + + // original source code + // trn1 v17.4S, v24.4S, v6.4S // *................................................................ + // trn2 v21.4S, v24.4S, v6.4S // ..*.............................................................. + // trn2 v14.4S, v2.4S, v14.4S // .*............................................................... + // trn2 v2.2D, v17.2D, v3.2D // ....*............................................................ + // trn1 v17.2D, v17.2D, v3.2D // ...*............................................................. + // trn2 v3.2D, v21.2D, v14.2D // ......*.......................................................... + // trn1 v21.2D, v21.2D, v14.2D // .....*........................................................... + // sub v14.4S, v2.4S, v3.4S // ........*........................................................ + // add v2.4S, v2.4S, v3.4S // .........*....................................................... + // sub v3.4S, v17.4S, v21.4S // .......*......................................................... + // add v17.4S, v17.4S, v21.4S // ..............*.................................................. + // mul v21.4S, v14.4S, v15.4S // ............*.................................................... + // mul v24.4S, v3.4S, v28.4S // .............*................................................... + // sqrdmulh v3.4S, v3.4S, v1.4S // ..........*...................................................... + // sqrdmulh v14.4S, v14.4S, v13.4S // ...........*..................................................... + // sub v28.4S, v17.4S, v2.4S // .................*............................................... + // add v17.4S, v17.4S, v2.4S // .........................*....................................... 
+ // mls v24.4S, v3.4S, v29.4S // ................*................................................ + // mls v21.4S, v14.4S, v29.4S // ...............*................................................. + // mul v14.4S, v28.4S, v31.4S // .....................*........................................... + // sqrdmulh v2.4S, v28.4S, v26.4S // ....................*............................................ + // ldr q3, [x4], #8 // ..................*.............................................. + // sub v28.4S, v24.4S, v21.4S // ...................*............................................. + // mls v14.4S, v2.4S, v29.4S // ..........................*...................................... + // add v21.4S, v24.4S, v21.4S // ........................*........................................ + // mul v2.4S, v28.4S, v31.4S // .......................*......................................... + // sqrdmulh v24.4S, v28.4S, v26.4S // ......................*.......................................... + // trn1 v28.4S, v17.4S, v21.4S // ................................*................................ + // trn2 v17.4S, v17.4S, v21.4S // ............................*.................................... + // ldr q21, [x4], #16 // .............................*................................... + // mls v2.4S, v24.4S, v29.4S // ...........................*..................................... + // trn1 v31.4S, v14.4S, v2.4S // ...............................*................................. + // trn2 v14.4S, v14.4S, v2.4S // ..............................*.................................. + // trn2 v26.2D, v28.2D, v31.2D // ...................................*............................. + // trn2 v1.2D, v17.2D, v14.2D // ..................................*.............................. + // trn1 v28.2D, v28.2D, v31.2D // ....................................*............................ 
+ // trn1 v17.2D, v17.2D, v14.2D // .................................*............................... + // sub v14.4S, v26.4S, v1.4S // .....................................*........................... + // add v31.4S, v28.4S, v17.4S // .......................................*......................... + // sub v17.4S, v28.4S, v17.4S // .........................................*....................... + // add v28.4S, v26.4S, v1.4S // ......................................*.......................... + // mul v26.4S, v14.4S, v21.S[2] // ........................................*........................ + // mul v1.4S, v17.4S, v21.S[0] // .............................................*................... + // sqrdmulh v17.4S, v17.4S, v21.S[1] // ............................................*.................... + // sqrdmulh v21.4S, v14.4S, v21.S[3] // ..........................................*...................... + // sub v14.4S, v31.4S, v28.4S // ...............................................*................. + // add v28.4S, v31.4S, v28.4S // ...........................................*..................... + // mls v1.4S, v17.4S, v29.4S // .................................................*............... + // mls v26.4S, v21.4S, v29.4S // ................................................*................ + // mul v17.4S, v14.4S, v3.S[0] // ...................................................*............. + // sqrdmulh v21.4S, v14.4S, v3.S[1] // ..................................................*.............. + // srshr v14.4S, v28.4S, #23 // ..............................................*.................. + // sub v31.4S, v1.4S, v26.4S // .....................................................*........... + // add v26.4S, v1.4S, v26.4S // ......................................................*.......... + // mls v17.4S, v21.4S, v29.4S // .......................................................*......... 
+ // mul v21.4S, v31.4S, v3.S[0] // ........................................................*........ + // sqrdmulh v3.4S, v31.4S, v3.S[1] // ..........................................................*...... + // mls v28.4S, v14.4S, v29.4S // ....................................................*............ + // srshr v14.4S, v26.4S, #23 // .........................................................*....... + // str q17, [x0, #32] // ...........................................................*..... + // mls v21.4S, v3.4S, v29.4S // ..............................................................*.. + // mls v26.4S, v14.4S, v29.4S // ............................................................*.... + // str q28, [x0], #(16*4) // .............................................................*... + // str q26, [x0, #-48] // ...............................................................*. + // str q21, [x0, #-16] // ................................................................* + + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q15, [x1, #832] // 
.............*.......................................................................................................................................................................................................................................................................... + ldr q28, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + ldr q10, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + sub v19.4S, v28.4S, v15.4S // ..............................................*......................................................................................................................................................................................................................................... + ldr q21, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q17, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. 
+ ldr q20, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + sub v18.4S, v21.4S, v17.4S // ....................................*................................................................................................................................................................................................................................................... + mul v12.4S, v19.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sub v23.4S, v10.4S, v20.4S // .........................................*.............................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v18.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + mul v27.4S, v18.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. 
+ sqrdmulh v22.4S, v23.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + mul v11.4S, v23.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + ldr q23, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + mls v27.4S, v13.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + mls v11.4S, v22.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + ldr q18, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v19.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + sub v22.4S, v27.4S, v11.4S // .................................................................................*...................................................................................................................................................................................................... + add v14.4S, v18.4S, v23.4S // ...........................*............................................................................................................................................................................................................................................................ + sub v23.4S, v18.4S, v23.4S // ..........................*............................................................................................................................................................................................................................................................. + mul v19.4S, v22.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ sqrdmulh v8.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + mul v18.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + ldr q13, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + add v16.4S, v28.4S, v15.4S // ...............................................*........................................................................................................................................................................................................................................ + add v23.4S, v21.4S, v17.4S // .....................................*.................................................................................................................................................................................................................................................. + ldr q17, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... 
+ ldr q28, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + add v9.4S, v17.4S, v13.4S // ....................................................*................................................................................................................................................................................................................................... + sub v13.4S, v17.4S, v13.4S // ...................................................*.................................................................................................................................................................................................................................... + mls v18.4S, v8.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + sub v15.4S, v16.4S, v9.4S // ......................................................................................*................................................................................................................................................................................................. + mul v21.4S, v13.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. 
+ sqrdmulh v8.4S, v13.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v17.4S, v15.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mul v15.4S, v15.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mls v12.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mls v21.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + add v20.4S, v10.4S, v20.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ mls v15.4S, v17.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + add v27.4S, v27.4S, v11.4S // ..................................................................................*..................................................................................................................................................................................................... + add v24.4S, v12.4S, v21.4S // ............................................................................................*........................................................................................................................................................................................... + ldr q11, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + add v17.4S, v27.4S, v24.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v10.4S, v27.4S, v24.4S // .........................................................................................................................*.............................................................................................................................................................. 
+ sub v27.4S, v28.4S, v11.4S // ...............................*........................................................................................................................................................................................................................................................ + add v24.4S, v28.4S, v11.4S // ................................*....................................................................................................................................................................................................................................................... + add v13.4S, v16.4S, v9.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v9.4S, v10.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + sub v16.4S, v14.4S, v24.4S // ..................................................................*..................................................................................................................................................................................................................... + sqrdmulh v11.4S, v27.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... 
+ mul v8.4S, v27.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + mul v27.4S, v10.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sub v10.4S, v23.4S, v20.4S // ............................................................................*........................................................................................................................................................................................................... + mls v8.4S, v11.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v16.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v28.4S, v16.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... 
+ add v24.4S, v14.4S, v24.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v27.4S, v9.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v9.4S, v18.4S, v8.4S // .......................................................................*................................................................................................................................................................................................................ + ldr q16, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + sub v14.4S, v12.4S, v21.4S // ...........................................................................................*............................................................................................................................................................................................ + ldr q12, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... 
+ add v21.4S, v23.4S, v20.4S // .............................................................................*.......................................................................................................................................................................................................... + sqrdmulh v23.4S, v10.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + sub v20.4S, v16.4S, v12.4S // .....................*.................................................................................................................................................................................................................................................................. + add v16.4S, v16.4S, v12.4S // ......................*................................................................................................................................................................................................................................................................. + mls v19.4S, v22.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + mul v22.4S, v9.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. 
+ sub v12.4S, v21.4S, v13.4S // ....................................................................................................................*................................................................................................................................................................... + add v18.4S, v18.4S, v8.4S // ........................................................................*............................................................................................................................................................................................................... + mul v8.4S, v20.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + sqrdmulh v20.4S, v20.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + add v13.4S, v21.4S, v13.4S // .....................................................................................................................*.................................................................................................................................................................. + sqrdmulh v21.4S, v9.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. 
+ sqrdmulh v9.4S, v12.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v8.4S, v20.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mul v20.4S, v12.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + ldr q12, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + mls v28.4S, v11.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + ldr q11, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... 
+ mls v20.4S, v9.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + sub v9.4S, v12.4S, v11.4S // ................*....................................................................................................................................................................................................................................................................... + mul v10.4S, v10.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + add v11.4S, v12.4S, v11.4S // .................*...................................................................................................................................................................................................................................................................... + sqrdmulh v12.4S, v9.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v9.4S, v9.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... 
+ mls v10.4S, v23.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + add v23.4S, v11.4S, v16.4S // .........................................................*.............................................................................................................................................................................................................................. + mls v22.4S, v21.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mls v9.4S, v12.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sub v21.4S, v23.4S, v24.4S // ................................................................................................*....................................................................................................................................................................................... + add v24.4S, v23.4S, v24.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sub v23.4S, v11.4S, v16.4S // ........................................................*............................................................................................................................................................................................................................... + sub v16.4S, v9.4S, v8.4S // .............................................................*.......................................................................................................................................................................................................................... + add v9.4S, v9.4S, v8.4S // ..............................................................*......................................................................................................................................................................................................................... + mul v12.4S, v21.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v8.4S, v16.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mul v16.4S, v16.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
+ add v11.4S, v9.4S, v18.4S // ......................................................................................................*................................................................................................................................................................................. + sub v18.4S, v9.4S, v18.4S // .....................................................................................................*.................................................................................................................................................................................. + sqrdmulh v9.4S, v21.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mls v16.4S, v8.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + mul v21.4S, v18.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v8.4S, v18.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... 
+ mls v12.4S, v9.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + sub v18.4S, v11.4S, v17.4S // .............................................................................................................................................*.......................................................................................................................................... + add v9.4S, v11.4S, v17.4S // ..............................................................................................................................................*......................................................................................................................................... + mls v21.4S, v8.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sqrdmulh v11.4S, v23.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mul v8.4S, v23.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. 
+ mul v23.4S, v18.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v17.4S, v14.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v14.4S, v14.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + mls v8.4S, v11.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sqrdmulh v11.4S, v18.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + add v18.4S, v10.4S, v15.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ mls v14.4S, v17.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v17.4S, v8.4S, v28.4S // ..........................................................................................................*............................................................................................................................................................................. + add v28.4S, v8.4S, v28.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v15.4S, v10.4S, v15.4S // ..............................................................................................................................*......................................................................................................................................................... + add v10.4S, v19.4S, v14.4S // ....................................................................................................................................*................................................................................................................................................... + add v8.4S, v21.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... 
+ sub v21.4S, v21.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... + sub v19.4S, v19.4S, v14.4S // ...................................................................................................................................*.................................................................................................................................................... + mul v14.4S, v15.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v27.4S, v15.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mul v15.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... 
+ mls v23.4S, v11.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v11.4S, v19.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v14.4S, v27.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mls v15.4S, v21.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v21.4S, v31.4S, v23.4S // ....................................................................................................................................................................................*................................................................................................... + cmge v27.4S, v23.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. 
+ mul v19.4S, v19.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sub v27.4S, v21.4S, v27.4S // ......................................................................................................................................................................................*................................................................................................. + sub count, count, #1 +layer1234_start: + mul v21.4S, v8.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v8.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mls v19.4S, v11.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + sub v11.4S, v28.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... 
+ mls v23.4S, v27.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mls v21.4S, v8.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + mul v8.4S, v17.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v27.4S, v17.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + add v28.4S, v28.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + cmge v18.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... 
+ cmge v17.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v8.4S, v27.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sub v18.4S, v18.4S, v17.4S // ......................................................................................................................................................................................................................................................................*................. + sqrdmulh v17.4S, v11.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + cmge v27.4S, v31.4S, v15.4S // ....................................................................................................................................................................................................*................................................................................... + mul v11.4S, v11.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
+ str q23, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + add v23.4S, v16.4S, v22.4S // ................................................................................................................*....................................................................................................................................................................... + sub v22.4S, v16.4S, v22.4S // ...............................................................................................................*........................................................................................................................................................................ + mls v11.4S, v17.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + add v16.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + mul v17.4S, v22.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
+ sub v12.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + add v20.4S, v8.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + sub v14.4S, v8.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. + mls v21.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sqrdmulh v8.4S, v20.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v18.4S, v20.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. 
+ sub v20.4S, v24.4S, v13.4S // ........................................................................................................................................*............................................................................................................................................... + add v24.4S, v24.4S, v13.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v18.4S, v8.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mul v8.4S, v20.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + sqrdmulh v22.4S, v22.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ cmge v13.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................................*............... + str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + mls v8.4S, v20.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v17.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sub v20.4S, v13.4S, v20.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ cmge v21.4S, v31.4S, v8.4S // ................................................................................................................................................................................*....................................................................................................... + cmge v13.4S, v8.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v21.4S, v21.4S, v13.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v13.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v20.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ 
+ str q18, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. + mul v18.4S, v24.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v22.4S, v24.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mls v20.4S, v13.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + add v9.4S, v23.4S, v10.4S // ........................................................................................................................................................*............................................................................................................................... + mul v24.4S, v28.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ mls v18.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mls v8.4S, v21.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + cmge v21.4S, v11.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + cmge v13.4S, v20.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v22.4S, v15.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + sub v23.4S, v23.4S, v10.4S // .......................................................................................................................................................*................................................................................................................................ 
+ sub v22.4S, v27.4S, v22.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v10.4S, v12.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v27.4S, v31.4S, v20.4S // ....................................................................................................................................................................................................................................................*................................... + mls v15.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v22.4S, v27.4S, v13.4S // ......................................................................................................................................................................................................................................................*................................. + sqrdmulh v28.4S, v28.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ add v27.4S, v17.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + mls v20.4S, v22.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sqrdmulh v13.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v24.4S, v28.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mul v28.4S, v27.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + str q20, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... 
+ mul v20.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v22.4S, v27.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + cmge v23.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v27.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................................................................*............................... + mls v20.4S, v13.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v28.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ sub v13.4S, v27.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. + sub v17.4S, v17.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + str q8, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + cmge v23.4S, v31.4S, v28.4S // ............................................................................................................................................................................................................................................................................*........... + cmge v19.4S, v28.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v22.4S, v17.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
+ sub v27.4S, v23.4S, v19.4S // ..............................................................................................................................................................................................................................................................................*......... + mul v8.4S, v12.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + mul v12.4S, v14.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v14.4S, v14.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + ldr q23, [x1, #144] // ..e..................................................................................................................................................................................................................................................................................... + mls v24.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ mls v12.4S, v14.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + ldr q19, [x1, #208] // ...e.................................................................................................................................................................................................................................................................................... + mls v28.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + cmge v13.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v14.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v27.4S, v20.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ sub v14.4S, v14.4S, v13.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v13.4S, v31.4S, v11.4S // ........................................................................................................................................................................................*............................................................................................... + mls v8.4S, v10.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mls v12.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sub v21.4S, v13.4S, v21.4S // ..........................................................................................................................................................................................*............................................................................................. + str q15, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. 
+ mul v14.4S, v16.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v11.4S, v21.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + str q12, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + sqrdmulh v13.4S, v16.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + sub v15.4S, v23.4S, v19.4S // .....................e.................................................................................................................................................................................................................................................................. + str q11, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
+ add v11.4S, v23.4S, v19.4S // ......................e................................................................................................................................................................................................................................................................. + ldr q23, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... + str q28, [x1, #448] // .......................................................................................................................................................................................................................................................................................* + cmge v16.4S, v31.4S, v20.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v21.4S, v8.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + sub v12.4S, v16.4S, v27.4S // ..............................................................................................................................................................................................*......................................................................................... 
+ ldr q10, [x1, #16] // e....................................................................................................................................................................................................................................................................................... + ldr q27, [x1, #656] // ..........e............................................................................................................................................................................................................................................................................. + sub v19.4S, v10.4S, v23.4S // ................e....................................................................................................................................................................................................................................................................... + mul v16.4S, v15.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + sqrdmulh v28.4S, v15.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v15.4S, v19.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
+ sqrdmulh v19.4S, v19.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + add v23.4S, v10.4S, v23.4S // .................e...................................................................................................................................................................................................................................................................... + mls v20.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v16.4S, v28.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + mls v15.4S, v19.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v19.4S, v23.4S, v11.4S // ........................................................e............................................................................................................................................................................................................................... 
+ str q20, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + add v20.4S, v23.4S, v11.4S // .........................................................e.............................................................................................................................................................................................................................. + mul v12.4S, v17.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + cmge v28.4S, v31.4S, v8.4S // ................................................................................................................................................................................................*....................................................................................... + mul v10.4S, v9.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sub v11.4S, v28.4S, v21.4S // ..................................................................................................................................................................................................*..................................................................................... 
+ ldr q23, [x1, #720] // ...........e............................................................................................................................................................................................................................................................................ + mls v8.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + add v11.4S, v15.4S, v16.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v28.4S, v15.4S, v16.4S // .............................................................e.......................................................................................................................................................................................................................... + sqrdmulh v16.4S, v9.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + ldr q15, [x1, #912] // ..............e......................................................................................................................................................................................................................................................................... 
+ sub v17.4S, v27.4S, v23.4S // .........................................e.............................................................................................................................................................................................................................................. + mls v10.4S, v16.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + add v16.4S, v27.4S, v23.4S // ..........................................e............................................................................................................................................................................................................................................. + mls v12.4S, v22.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v21.4S, v18.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v9.4S, v31.4S, v18.4S // ................................................................................................................................................................................................................................................*....................................... 
+ mls v14.4S, v13.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v23.4S, v9.4S, v21.4S // ..................................................................................................................................................................................................................................................*..................................... + ldr q22, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... + cmge v13.4S, v31.4S, v14.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v21.4S, v14.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + sqrdmulh v27.4S, v19.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ 
+ sub v9.4S, v13.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... + ldr q13, [x1, #784] // ............e........................................................................................................................................................................................................................................................................... + mls v14.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v18.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + ldr q23, [x1, #848] // .............e.......................................................................................................................................................................................................................................................................... + str q14, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... 
+ cmge v14.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + str q18, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v18.4S, v31.4S, v10.4S // ............................................................................................................................................................................................................................................................*........................... + str q24, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + add v24.4S, v13.4S, v23.4S // ...............................................e........................................................................................................................................................................................................................................ + sub v13.4S, v13.4S, v23.4S // ..............................................e......................................................................................................................................................................................................................................... 
+ mul v23.4S, v19.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + cmge v9.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................*........................................................................... + ldr q19, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + mls v23.4S, v27.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + ldr q27, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + sub v9.4S, v9.4S, v14.4S // ..............................................................................................................................................................................................................*......................................................................... 
+ str q8, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + add v14.4S, v19.4S, v27.4S // .....................................e.................................................................................................................................................................................................................................................. + ldr q8, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + sub v21.4S, v19.4S, v27.4S // ....................................e................................................................................................................................................................................................................................................... + sqrdmulh v19.4S, v13.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + sub v27.4S, v15.4S, v8.4S // ...................................................e.................................................................................................................................................................................................................................... 
+ mul v13.4S, v13.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + add v8.4S, v15.4S, v8.4S // ....................................................e................................................................................................................................................................................................................................... + mul v15.4S, v27.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sqrdmulh v27.4S, v27.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mls v13.4S, v19.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + mul v19.4S, v21.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. 
+ sqrdmulh v21.4S, v21.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + mls v15.4S, v27.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mls v12.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + cmge v9.4S, v10.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v19.4S, v21.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + sub v27.4S, v13.4S, v15.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ add v13.4S, v13.4S, v15.4S // ............................................................................................e........................................................................................................................................................................................... + sub v18.4S, v18.4S, v9.4S // ..............................................................................................................................................................................................................................................................*......................... + mul v21.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + sqrdmulh v9.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mls v10.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + mul v18.4S, v17.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ 
+ sqrdmulh v27.4S, v17.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mls v21.4S, v9.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sub v17.4S, v14.4S, v16.4S // ............................................................................e........................................................................................................................................................................................................... + sqrdmulh v15.4S, v28.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mls v18.4S, v27.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v17.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ 
+ ldr q27, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + add v16.4S, v14.4S, v16.4S // .............................................................................e.......................................................................................................................................................................................................... + str q10, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + sub v14.4S, v22.4S, v27.4S // ..........................e............................................................................................................................................................................................................................................................. + str q12, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + add v12.4S, v19.4S, v18.4S // ..................................................................................e..................................................................................................................................................................................................... 
+ mul v10.4S, v14.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + add v22.4S, v22.4S, v27.4S // ...........................e............................................................................................................................................................................................................................................................ + sub v27.4S, v12.4S, v13.4S // .........................................................................................................................e.............................................................................................................................................................. + add v12.4S, v12.4S, v13.4S // ..........................................................................................................................e............................................................................................................................................................. + mls v10.4S, v14.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... 
+ sqrdmulh v13.4S, v27.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mul v14.4S, v27.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + add v27.4S, v24.4S, v8.4S // .......................................................................................e................................................................................................................................................................................................ + sub v8.4S, v24.4S, v8.4S // ......................................................................................e................................................................................................................................................................................................. + ldr q24, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + mls v14.4S, v13.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... 
+ add v13.4S, v16.4S, v27.4S // .....................................................................................................................e.................................................................................................................................................................. + sub v27.4S, v16.4S, v27.4S // ....................................................................................................................e................................................................................................................................................................... + mul v16.4S, v28.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sub v28.4S, v19.4S, v18.4S // .................................................................................e...................................................................................................................................................................................................... + mul v18.4S, v8.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sqrdmulh v19.4S, v8.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. 
+ ldr q8, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + mls v16.4S, v15.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + mul v15.4S, v28.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + mls v18.4S, v19.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sub v19.4S, v8.4S, v24.4S // ...............................e........................................................................................................................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ add v24.4S, v8.4S, v24.4S // ................................e....................................................................................................................................................................................................................................................... + mul v8.4S, v19.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + mls v15.4S, v28.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + sub v28.4S, v22.4S, v24.4S // ..................................................................e..................................................................................................................................................................................................................... + mul v17.4S, v17.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... 
+ mls v8.4S, v19.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mul v19.4S, v28.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + mls v17.4S, v9.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v9.4S, v10.4S, v8.4S // ........................................................................e............................................................................................................................................................................................................... + sub v8.4S, v10.4S, v8.4S // .......................................................................e................................................................................................................................................................................................................ 
+ mls v19.4S, v28.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + sub v28.4S, v11.4S, v9.4S // .....................................................................................................e.................................................................................................................................................................................. + sub v10.4S, v17.4S, v18.4S // ..............................................................................................................................e......................................................................................................................................................... + add v11.4S, v11.4S, v9.4S // ......................................................................................................e................................................................................................................................................................................. + mul v9.4S, v28.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... 
+ add v24.4S, v22.4S, v24.4S // ...................................................................e.................................................................................................................................................................................................................... + add v18.4S, v17.4S, v18.4S // ...............................................................................................................................e........................................................................................................................................................ + mul v22.4S, v8.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + mls v9.4S, v28.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + add v28.4S, v23.4S, v19.4S // ...........................................................................................................e............................................................................................................................................................................ + sub v17.4S, v23.4S, v19.4S // ..........................................................................................................e............................................................................................................................................................................. 
+ sub v23.4S, v20.4S, v24.4S // ................................................................................................e....................................................................................................................................................................................... + sub v19.4S, v9.4S, v14.4S // .................................................................................................................................................................e...................................................................................................................... + add v24.4S, v20.4S, v24.4S // .................................................................................................e...................................................................................................................................................................................... + sqrdmulh v20.4S, v8.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + add v8.4S, v9.4S, v14.4S // ..................................................................................................................................................................e..................................................................................................................... + mul v14.4S, v10.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... 
+ add v9.4S, v11.4S, v12.4S // ..............................................................................................................................................e......................................................................................................................................... + sub v11.4S, v11.4S, v12.4S // .............................................................................................................................................e.......................................................................................................................................... + mul v12.4S, v23.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + mls v22.4S, v20.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + sqrdmulh v10.4S, v10.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + sqrdmulh v20.4S, v23.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... 
+ mul v23.4S, v11.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v14.4S, v10.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + add v10.4S, v15.4S, v21.4S // ....................................................................................................................................e................................................................................................................................................... + mls v12.4S, v20.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + sub v21.4S, v15.4S, v21.4S // ...................................................................................................................................e.................................................................................................................................................... 
+ mul v20.4S, v27.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + sqrdmulh v27.4S, v27.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mls v23.4S, v11.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + sqrdmulh v11.4S, v19.4S, v0.S[1] // ....................................................................................................................................................................e................................................................................................................... + mul v15.4S, v19.4S, v0.S[0] // ...................................................................................................................................................................e.................................................................................................................... + mls v20.4S, v27.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... 
+ cmge v27.4S, v31.4S, v23.4S // ....................................................................................................................................................................................e................................................................................................... + cmge v19.4S, v23.4S, v30.4S // .....................................................................................................................................................................................e.................................................................................................. + mls v15.4S, v11.4S, v29.4S // .....................................................................................................................................................................e.................................................................................................................. + sub v27.4S, v27.4S, v19.4S // ......................................................................................................................................................................................e................................................................................................. + sqrdmulh v11.4S, v21.4S, v1.S[1] // ......................................................................................................................................e................................................................................................................................................. + mul v19.4S, v21.4S, v1.S[0] // .....................................................................................................................................e.................................................................................................................................................. 
+ + // original source code + // ldr q8, [x1, #0] // ..........................e.......................................................................................................................................................................|...............................................................................................................e........................................................................................ + // ldr q9, [x1, #(1*(512/8))] // .....................e............................................................................................................................................................................|..........................................................................................................e............................................................................................. + // ldr q10, [x1, #(2*(512/8))] // e.................................................................................................................................................................................................|.....................................................................................e.................................................................................................................. + // ldr q11, [x1, #(3*(512/8))] // ...e..............................................................................................................................................................................................|........................................................................................e............................................................................................................... 
+ // ldr q12, [x1, #(4*(512/8))] // ..........................................................e.......................................................................................................................................|...............................................................................................................................................e........................................................ + // ldr q13, [x1, #(5*(512/8))] // ..............................................................................................................e...................................................................................|...................................................................................................................................................................................................e.... + // ldr q14, [x1, #(6*(512/8))] // ......................................................................................................................................e...........................................................|........................................................................................................................................................................................................ + // ldr q15, [x1, #(7*(512/8))] // ..............................................................................................................................e...................................................................|........................................................................................................................................................................................................ 
+ // ldr q16, [x1, #(8*(512/8))] // ............................................................................e.....................................................................................................................|.................................................................................................................................................................e...................................... + // ldr q17, [x1, #(9*(512/8))] // ..............................................................................e...................................................................................................................|...................................................................................................................................................................e.................................... + // ldr q18, [x1, #(10*(512/8))] // ...........................e......................................................................................................................................................................|................................................................................................................e....................................................................................... + // ldr q19, [x1, #(11*(512/8))] // ............................................e.....................................................................................................................................................|.................................................................................................................................e...................................................................... 
+ // ldr q20, [x1, #(12*(512/8))] // ...............................................................e..................................................................................................................................|....................................................................................................................................................e................................................... + // ldr q21, [x1, #(13*(512/8))] // ..................................................................e...............................................................................................................................|.......................................................................................................................................................e................................................ + // ldr q22, [x1, #(14*(512/8))] // .................................................e................................................................................................................................................|......................................................................................................................................e................................................................. + // ldr q23, [x1, #(15*(512/8))] // ..................................................................................e...............................................................................................................|.......................................................................................................................................................................e................................ 
+ // sub v24.4s, v8.4s, v9.4s // ............................e.....................................................................................................................................................................|.................................................................................................................e...................................................................................... + // add v8.4s, v8.4s, v9.4s // .................................e................................................................................................................................................................|......................................................................................................................e................................................................................. + // mul v9.4s, v24.4s, v3.s[2] // ...............................e..................................................................................................................................................................|....................................................................................................................e................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ................................e.................................................................................................................................................................|.....................................................................................................................e.................................................................................. 
+ // mls v9.4s, v24.4s, v29.4s // ....................................e.............................................................................................................................................................|.........................................................................................................................e.............................................................................. + // sub v24.4s, v10.4s, v11.4s // ..................e...............................................................................................................................................................................|.......................................................................................................e................................................................................................ + // add v10.4s, v10.4s, v11.4s // ....................e.............................................................................................................................................................................|.........................................................................................................e.............................................................................................. + // mul v11.4s, v24.4s, v4.s[0] // .............................e....................................................................................................................................................................|..................................................................................................................e..................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..............................e...................................................................................................................................................................|...................................................................................................................e.................................................................................... + // mls v11.4s, v24.4s, v29.4s // ...................................e..............................................................................................................................................................|........................................................................................................................e............................................................................... + // sub v24.4s, v12.4s, v13.4s // .................................................................................................................e................................................................................|......................................................................................................................................................................................................e. + // add v12.4s, v12.4s, v13.4s // ......................................................................................................................e...........................................................................|........................................................................................................................................................................................................ 
+ // mul v13.4s, v24.4s, v4.s[2] // ....................................................................................................................e.............................................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .....................................................................................................................e............................................................................|........................................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .........................................................................................................................e........................................................................|........................................................................................................................................................................................................ + // sub v24.4s, v14.4s, v15.4s // ..........................................................................................................................................e.......................................................|........................................................................................................................................................................................................ 
+ // add v14.4s, v14.4s, v15.4s // ............................................................................................................................................e.....................................................|........................................................................................................................................................................................................ + // mul v15.4s, v24.4s, v5.s[0] // .............................................................................................................................................e....................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ..............................................................................................................................................e...................................................|........................................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................................................................e...............................................|........................................................................................................................................................................................................ 
+ // sub v24.4s, v16.4s, v17.4s // ...................................................................................e..............................................................................................................|........................................................................................................................................................................e............................... + // add v16.4s, v16.4s, v17.4s // .................................................................................e................................................................................................................|......................................................................................................................................................................e................................. + // mul v17.4s, v24.4s, v5.s[2] // ...........................................................................................e......................................................................................................|................................................................................................................................................................................e....................... + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ............................................................................................e.....................................................................................................|.................................................................................................................................................................................e...................... 
+ // mls v17.4s, v24.4s, v29.4s // ................................................................................................e.................................................................................................|.....................................................................................................................................................................................e.................. + // sub v24.4s, v18.4s, v19.4s // ..................................................e...............................................................................................................................................|.......................................................................................................................................e................................................................ + // add v18.4s, v18.4s, v19.4s // ....................................................e.............................................................................................................................................|.........................................................................................................................................e.............................................................. + // mul v19.4s, v24.4s, v6.s[0] // .......................................................................................................e..........................................................................................|............................................................................................................................................................................................e........... 
+ // sqrdmulh v24.4s, v24.4s, v6.s[1] // ........................................................................................................e.........................................................................................|.............................................................................................................................................................................................e.......... + // mls v19.4s, v24.4s, v29.4s // ............................................................................................................e.....................................................................................|.................................................................................................................................................................................................e...... + // sub v24.4s, v20.4s, v21.4s // .........................................................................e........................................................................................................................|..............................................................................................................................................................e......................................... + // add v20.4s, v20.4s, v21.4s // ........................................................................e.........................................................................................................................|.............................................................................................................................................................e.......................................... 
+ // mul v21.4s, v24.4s, v6.s[2] // ......................................................................................e...........................................................................................................|...........................................................................................................................................................................e............................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................e.............................................................................................................|.........................................................................................................................................................................e.............................. + // mls v21.4s, v24.4s, v29.4s // ..........................................................................................e.......................................................................................................|...............................................................................................................................................................................e........................ + // sub v24.4s, v22.4s, v23.4s // .....................................................................................e............................................................................................................|..........................................................................................................................................................................e............................. 
+ // add v22.4s, v22.4s, v23.4s // .......................................................................................e..........................................................................................................|............................................................................................................................................................................e........................... + // mul v23.4s, v24.4s, v7.s[0] // ........................................................................................e.........................................................................................................|.............................................................................................................................................................................e.......................... + // sqrdmulh v24.4s, v24.4s, v7.s[1] // .........................................................................................e........................................................................................................|..............................................................................................................................................................................e......................... + // mls v23.4s, v24.4s, v29.4s // .............................................................................................e....................................................................................................|..................................................................................................................................................................................e..................... 
+ // sub v24.4s, v8.4s, v10.4s // .....................................e............................................................................................................................................................|..........................................................................................................................e............................................................................. + // add v8.4s, v8.4s, v10.4s // .......................................e..........................................................................................................................................................|............................................................................................................................e........................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ..........................................................................e.......................................................................................................................|...............................................................................................................................................................e........................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................e....................................................................................................................................|..................................................................................................................................................e..................................................... 
+ // mls v10.4s, v24.4s, v29.4s // .............................................................................e....................................................................................................................|..................................................................................................................................................................e..................................... + // sub v24.4s, v9.4s, v11.4s // ...............................................e..................................................................................................................................................|....................................................................................................................................e................................................................... + // add v9.4s, v9.4s, v11.4s // ..............................................e...................................................................................................................................................|...................................................................................................................................e.................................................................... + // mul v11.4s, v24.4s, v1.s[2] // ..................................................................................................................................e...............................................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...........................................................................................................e......................................................................................|................................................................................................................................................................................................e....... + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................................................e..........................................................|........................................................................................................................................................................................................ + // sub v24.4s, v12.4s, v14.4s // ................................................................................................................................................e.................................................|........................................................................................................................................................................................................ + // add v12.4s, v12.4s, v14.4s // ..............................................................................................................................................................e...................................|........................................................................................................................................................................................................ 
+ // mul v14.4s, v24.4s, v2.s[0] // ...................................................................................................................................................e..............................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ....................................................................................................................................................e.............................................|........................................................................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // ........................................................................................................................................................e.........................................|........................................................................................................................................................................................................ + // sub v24.4s, v13.4s, v15.4s // .......................................................................................................................................................e..........................................|........................................................................................................................................................................................................ 
+ // add v13.4s, v13.4s, v15.4s // ......................................................................................................................................................e...........................................|........................................................................................................................................................................................................ + // mul v15.4s, v24.4s, v2.s[0] // ................................................................................................................................................................e.................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .......................................................................................................................................................................e..........................|........................................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // .............................................................................................................................................................................e....................|........................................................................................................................................................................................................ 
+ // sub v24.4s, v16.4s, v18.4s // ..........................................................................................................e.......................................................................................|...............................................................................................................................................................................................e........ + // add v16.4s, v16.4s, v18.4s // ...............................................................................................................e..................................................................................|....................................................................................................................................................................................................e... + // mul v18.4s, v24.4s, v2.s[2] // .................................................................................................................................................e................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .............................................................................................................e....................................................................................|..................................................................................................................................................................................................e..... 
+ // mls v18.4s, v24.4s, v29.4s // .....................................................................................................................................................e............................................|........................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // ...................................................................................................................................e..............................................................|........................................................................................................................................................................................................ + // add v17.4s, v17.4s, v19.4s // ...................................................................................................................e..............................................................................|........................................................................................................................................................................................................ + // mul v19.4s, v24.4s, v2.s[2] // ........................................................................................................................................e.........................................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................................e......................................................|........................................................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ...............................................................................................................................................e..................................................|........................................................................................................................................................................................................ + // sub v24.4s, v20.4s, v22.4s // .............................................................................................................................e....................................................................|........................................................................................................................................................................................................ + // add v20.4s, v20.4s, v22.4s // ............................................................................................................................e.....................................................................|........................................................................................................................................................................................................ 
+ // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................e.............................................................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .....................................................................................................................................e............................................................|........................................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // .........................................................................................................................................e........................................................|........................................................................................................................................................................................................ + // sub v24.4s, v21.4s, v23.4s // .................................................................................................e................................................................................................|......................................................................................................................................................................................e................. 
+ // add v21.4s, v21.4s, v23.4s // ..................................................................................................e...............................................................................................|.......................................................................................................................................................................................e................ + // mul v23.4s, v24.4s, v3.s[0] // ....................................................................................................e.............................................................................................|.........................................................................................................................................................................................e.............. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .....................................................................................................e............................................................................................|..........................................................................................................................................................................................e............. + // mls v23.4s, v24.4s, v29.4s // .........................................................................................................e........................................................................................|..............................................................................................................................................................................................e......... 
+ // sub v24.4s, v8.4s, v12.4s // ....................................................................................................................................................................e.............................|........................................................................................................................................................................................................ + // add v8.4s, v8.4s, v12.4s // ......................................................................................................................................................................e...........................|........................................................................................................................................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ............................................................................................................................................................................e.....................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................e..................|........................................................................................................................................................................................................ 
+ // mls v12.4s, v24.4s, v29.4s // ....................................................................................................................................................................................e.............|........................................................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................................................................e........................................|........................................................................................................................................................................................................ + // add v9.4s, v9.4s, v13.4s // ...........................................................................................................................................................e......................................|........................................................................................................................................................................................................ + // mul v13.4s, v24.4s, v0.s[2] // ............................................................................................................................................................e.....................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................................................e....................................|........................................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .................................................................................................................................................................e................................|........................................................................................................................................................................................................ + // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................................................................e..............................|........................................................................................................................................................................................................ + // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................................................e...............................|........................................................................................................................................................................................................ 
+ // mul v14.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................................................|.....*.................................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................................|......*................................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..........*............................................................................................................................................................................................. + // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................................................................................................|.................*...................................................................................................................................................................................... 
+ // add v11.4s, v11.4s, v15.4s // ..................................................................................................................................................................................................|................*....................................................................................................................................................................................... + // mul v15.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................................................|....................*................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................................|................................*....................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.....................................*.................................................................................................................................................................. 
+ // sub v24.4s, v16.4s, v20.4s // .................................................................................................................................e................................................................|........................................................................................................................................................................................................ + // add v16.4s, v16.4s, v20.4s // ................................................................................................................................e.................................................................|........................................................................................................................................................................................................ + // mul v20.4s, v24.4s, v1.s[0] // ......................................................................................................................................................................................e...........|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................................................................................................................e..........|........................................................................................................................................................................................................ 
+ // mls v20.4s, v24.4s, v29.4s // ...........................................................................................................................................................................................e......|........................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // .......................................................................................................................e..........................................................................|........................................................................................................................................................................................................ + // add v17.4s, v17.4s, v21.4s // ........................................................................................................................e.........................................................................|........................................................................................................................................................................................................ + // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................e......................................................................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e.......................................................................|........................................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................................e..................................................................|........................................................................................................................................................................................................ + // sub v24.4s, v18.4s, v22.4s // ..........................................................................................................................................................e.......................................|........................................................................................................................................................................................................ + // add v18.4s, v18.4s, v22.4s // ...............................................................................................................................................................e..................................|........................................................................................................................................................................................................ 
+ // mul v22.4s, v24.4s, v1.s[0] // .........................................................................................................................................................................e........................|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................................e...................|........................................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ..................................................................................................................................................................................e...............|........................................................................................................................................................................................................ + // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e............|........................................................................................................................................................................................................ 
+ // add v19.4s, v19.4s, v23.4s // ...................................................................................................................................................................................e..............|........................................................................................................................................................................................................ + // mul v23.4s, v24.4s, v1.s[0] // .................................................................................................................................................................................................e|........................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................................................................................................................e.|........................................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.*...................................................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v16.4s // ..................................................................................................................................................................................................|...........................*............................................................................................................................................................................ + // add v8.4s, v8.4s, v16.4s // ..................................................................................................................................................................................................|............................*........................................................................................................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|..............................*......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|...............................*........................................................................................................................................................................ 
+ // mls v16.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...................................*.................................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // ...........................................................................................................................................................................e......................|........................................................................................................................................................................................................ + // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................................e.......................|........................................................................................................................................................................................................ + // mul v17.4s, v24.4s, v0.s[0] // ................................................................................................................................................................................e.................|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................................................................................................................e................|........................................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // ........................................................................................................................................................................................e.........|........................................................................................................................................................................................................ + // sub v24.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|..*..................................................................................................................................................................................................... + // add v10.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|.......*................................................................................................................................................................................................ 
+ // mul v18.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|..............*......................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|............*........................................................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................*..................................................................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|........................................................*............................................................................................................................................... 
+ // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|.................................................*...................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|.....................................................................*.................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|.................................................................*...................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.........................................................................*.............................................................................................................................. 
+ // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|.....................*.................................................................................................................................................................................. + // add v12.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|...................*.................................................................................................................................................................................... + // mul v20.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|..................................................................................*..................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|..........................................................*............................................................................................................................................. 
+ // mls v20.4s, v24.4s, v29.4s // ..........*.......................................................................................................................................................................................|...............................................................................................*........................................................................................................ + // sub v24.4s, v13.4s, v21.4s // .....................................................................................................................................................................e............................|........................................................................................................................................................................................................ + // add v13.4s, v13.4s, v21.4s // ........................................................................................................................................................................e.........................|........................................................................................................................................................................................................ + // mul v21.4s, v24.4s, v0.s[0] // ..........................................................................................................................................................................................e.......|........................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................................................................e........|........................................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ..............................................................................................................................................................................................e...|........................................................................................................................................................................................................ + // sub v24.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|.......................*................................................................................................................................................................................ + // add v14.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|......................*................................................................................................................................................................................. 
+ // mul v22.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...................................................................................*.................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|....................................................................................*................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ..*...............................................................................................................................................................................................|.......................................................................................*................................................................................................................ + // sub v24.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|............................................................................*........................................................................................................................... 
+ // add v15.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|...............................................................*........................................................................................................................................ + // mul v23.4s, v24.4s, v0.s[0] // ........................................*.........................................................................................................................................................|.............................................................................................................................*.......................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|................................................................................*....................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .....................................................*............................................................................................................................................|..........................................................................................................................................*............................................................. 
+ // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|.......................................*................................................................................................................................................................ + // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|........................................*............................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|..........................................*............................................................................................................................................................. + // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|....................................................*................................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................e.....|........................................................................................................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // .............................................................................................................................................................................................e....|........................................................................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ...............................................................................................................................................................................................e..|........................................................................................................................................................................................................ + // mls v17.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|...*.................................................................................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v18.4s // .........*........................................................................................................................................................................................|..............................................................................................*......................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................................................................................................................................................................................|.....................................................*.................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ............*.....................................................................................................................................................................................|.................................................................................................*...................................................................................................... + // mls v18.4s, v28.4s, v29.4s // ...............*..................................................................................................................................................................................|....................................................................................................*................................................................................................... 
+ // cmge v27.4s, v31.4s, v19.4s // .......................*..........................................................................................................................................................................|............................................................................................................*........................................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......*..........................................................................................................................................................................................|............................................................................................*........................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .........................*........................................................................................................................................................................|..............................................................................................................*......................................................................................... + // mls v19.4s, v28.4s, v29.4s // ..................................*...............................................................................................................................................................|.......................................................................................................................*................................................................................ 
+ // cmge v27.4s, v31.4s, v20.4s // .........................................*........................................................................................................................................................|..............................................................................................................................*......................................................................... + // cmge v28.4s, v20.4s, v30.4s // ........................*.........................................................................................................................................................................|.............................................................................................................*.......................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................*......................................................................................................................................................|................................................................................................................................*....................................................................... + // mls v20.4s, v28.4s, v29.4s // .............................................*....................................................................................................................................................|..................................................................................................................................*..................................................................... 
+ // cmge v27.4s, v31.4s, v21.4s // ..................................................................................................................................................................................................|.............*.......................................................................................................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ..................................................................................................................................................................................................|.......................................................*................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.........................................................*.............................................................................................................................................. + // mls v21.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|............................................................*........................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v22.4s // ......*...........................................................................................................................................................................................|...........................................................................................*............................................................................................................ + // cmge v28.4s, v22.4s, v30.4s // .....*............................................................................................................................................................................................|..........................................................................................*............................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ........*.........................................................................................................................................................................................|.............................................................................................*.......................................................................................................... + // mls v22.4s, v28.4s, v29.4s // ...........*......................................................................................................................................................................................|................................................................................................*....................................................................................................... 
+ // cmge v27.4s, v31.4s, v23.4s // ...........................................................................*......................................................................................................................|................................................................................................................................................................*....................................... + // cmge v28.4s, v23.4s, v30.4s // ....................................................................*.............................................................................................................................|.........................................................................................................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ...............................................................................*..................................................................................................................|....................................................................................................................................................................*................................... + // mls v23.4s, v28.4s, v29.4s // ..............................................................................................*...................................................................................................|...................................................................................................................................................................................*.................... 
+ // str q16, [x1, #(8*(512/8))] // ..................................................................................................................................................................................................|.............................................................................*.......................................................................................................................... + // str q17, [x1, #(9*(512/8))] // ..................................................................................................................................................................................................|...............*........................................................................................................................................................................................ + // str q18, [x1, #(10*(512/8))] // ...................*..............................................................................................................................................................................|........................................................................................................*............................................................................................... + // str q19, [x1, #(11*(512/8))] // ......................................*...........................................................................................................................................................|...........................................................................................................................*............................................................................ 
+ // str q20, [x1, #(12*(512/8))] // ................................................................................*.................................................................................................................|.....................................................................................................................................................................*.................................. + // str q21, [x1, #(13*(512/8))] // .............*....................................................................................................................................................................................|..................................................................................................*..................................................................................................... + // str q22, [x1, #(14*(512/8))] // ................*.................................................................................................................................................................................|.....................................................................................................*.................................................................................................. 
+ // str q23, [x1, #(15*(512/8))] // ..................................................................................................................*...............................................................................|.......................................................................................................................................................................................................* + // mul v16.4s, v8.4s, v25.4s // ..................................................................................................................................................................................................|..............................................*......................................................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ..................................................................................................................................................................................................|...............................................*........................................................................................................................................................ + // mls v16.4s, v8.4s, v29.4s // ..................................................................................................................................................................................................|...................................................*.................................................................................................................................................... 
+ // mul v17.4s, v9.4s, v25.4s // ..................................................................................................................................................................................................|............................................*........................................................................................................................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ..................................................................................................................................................................................................|...........................................*............................................................................................................................................................ + // mls v17.4s, v9.4s, v29.4s // ..................................................................................................................................................................................................|................................................*....................................................................................................................................................... + // mul v18.4s, v10.4s, v25.4s // ..................................................................................................................................................................................................|..................................................*..................................................................................................................................................... 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................................................................................................................................|..............................................................*......................................................................................................................................... + // mls v18.4s, v10.4s, v29.4s // ..................................................................................................................................................................................................|..................................................................*..................................................................................................................................... + // mul v19.4s, v11.4s, v25.4s // ..........................................*.......................................................................................................................................................|...............................................................................................................................*........................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ................................................*.................................................................................................................................................|.....................................................................................................................................*.................................................................. 
+ // mls v19.4s, v11.4s, v29.4s // ...................................................*..............................................................................................................................................|........................................................................................................................................*............................................................... + // mul v20.4s, v12.4s, v25.4s // ..............*...................................................................................................................................................................................|...................................................................................................*.................................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // .................*................................................................................................................................................................................|......................................................................................................*................................................................................................. + // mls v20.4s, v12.4s, v29.4s // ........................................................*.........................................................................................................................................|.............................................................................................................................................*.......................................................... 
+ // mul v21.4s, v13.4s, v25.4s // ..................................................................................................................................................................................................*........................................................................................................................................................................................................ + // sqrdmulh v13.4s, v13.4s, v26.4s // ..................................................................................................................................................................................................|*....................................................................................................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ..................................................................................................................................................................................................|....*................................................................................................................................................................................................... + // mul v22.4s, v14.4s, v25.4s // ..................................................................................................................................................................................................|..........................*............................................................................................................................................................................. 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // ..................................................................................................................................................................................................|.........................*.............................................................................................................................................................................. + // mls v22.4s, v14.4s, v29.4s // ..................................................................................................................................................................................................|.............................*.......................................................................................................................................................................... + // mul v23.4s, v15.4s, v25.4s // ..................................................................................................................................................................................................|...................................................................*.................................................................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ..................................................................................................................................................................................................|......................................................................*................................................................................................................................. 
+ // mls v23.4s, v15.4s, v29.4s // ..................................................................................................................................................................................................|..........................................................................*............................................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .......................................................*..........................................................................................................................................|............................................................................................................................................*........................................................... + // cmge v28.4s, v16.4s, v30.4s // ......................................................*...........................................................................................................................................|...........................................................................................................................................*............................................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................*........................................................................................................................................|..............................................................................................................................................*......................................................... 
+ // mls v16.4s, v28.4s, v29.4s // .................................................................*................................................................................................................................|......................................................................................................................................................*................................................. + // cmge v27.4s, v31.4s, v17.4s // ..................................................................................................................................................................................................|...........................................................*............................................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // ..................................................................................................................................................................................................|......................................................*................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.............................................................*.......................................................................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|................................................................*....................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ..................................................................................................................................................................................................|........................................................................*............................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................................................................................................................................................................................|.......................................................................*................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|...........................................................................*............................................................................................................................ 
+ // mls v18.4s, v28.4s, v29.4s // .*................................................................................................................................................................................................|......................................................................................*................................................................................................................. + // cmge v27.4s, v31.4s, v19.4s // ......................................................................*...........................................................................................................................|...........................................................................................................................................................*............................................ + // cmge v28.4s, v19.4s, v30.4s // ...............................................................................................*..................................................................................................|....................................................................................................................................................................................*................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................*..............................................................................................|........................................................................................................................................................................................*............... 
+ // mls v19.4s, v28.4s, v29.4s // ......................................................................................................*...........................................................................................|...........................................................................................................................................................................................*............ + // cmge v27.4s, v31.4s, v20.4s // ...........................................................*......................................................................................................................................|................................................................................................................................................*....................................................... + // cmge v28.4s, v20.4s, v30.4s // ............................................................*.....................................................................................................................................|.................................................................................................................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................*...................................................................................................................................|...................................................................................................................................................*.................................................... 
+ // mls v20.4s, v28.4s, v29.4s // ................................................................*.................................................................................................................................|.....................................................................................................................................................*.................................................. + // cmge v27.4s, v31.4s, v21.4s // ..................................................................................................................................................................................................|........*............................................................................................................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ..................................................................................................................................................................................................|.........*.............................................................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|...........*............................................................................................................................................................................................ 
+ // mls v21.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|........................*............................................................................................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ..................................................................................................................................................................................................|.................................*...................................................................................................................................................................... + // cmge v28.4s, v22.4s, v30.4s // ..................................................................................................................................................................................................|....................................*................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|......................................*................................................................................................................................................................. 
+ // mls v22.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.........................................*.............................................................................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // ..................................................................................................................................................................................................|..............................................................................*......................................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // ..................................................................................................................................................................................................|...............................................................................*........................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.................................................................................*...................................................................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ....*.............................................................................................................................................................................................|.........................................................................................*.............................................................................................................. + // str q16, [x1], #(16) // .....................................................................*............................................................................................................................|..........................................................................................................................................................*............................................. + // str q17, [x1, #(-16 + 1*(512/8))] // ..................................................................................................................................................................................................|....................................................................*................................................................................................................................... + // str q18, [x1, #(-16 + 2*(512/8))] // .......................................................................*..........................................................................................................................|............................................................................................................................................................*........................................... 
+ // str q19, [x1, #(-16 + 3*(512/8))] // ................................................................................................................*.................................................................................|.....................................................................................................................................................................................................*.. + // str q20, [x1, #(-16 + 4*(512/8))] // ...................................................................*..............................................................................................................................|........................................................................................................................................................*............................................... + // str q21, [x1, #(-16 + 5*(512/8))] // ..................................................................................................................................................................................................|..................................*..................................................................................................................................................................... + // str q22, [x1, #(-16 + 6*(512/8))] // ..................................................................................................................................................................................................|.............................................*.......................................................................................................................................................... 
+ // str q23, [x1, #(-16 + 7*(512/8))] // ......................*...........................................................................................................................................................................|...........................................................................................................*............................................................................................ + + sub count, count, #1 + cbnz count, layer1234_start + sub v21.4S, v16.4S, v22.4S // ...............................................................................................................*........................................................................................................................................................................ + mls v19.4S, v11.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v23.4S, v27.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sqrdmulh v11.4S, v21.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ mul v27.4S, v21.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + sub v21.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + str q23, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + add v23.4S, v16.4S, v22.4S // ................................................................................................................*....................................................................................................................................................................... + mls v27.4S, v11.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... 
+ add v11.4S, v24.4S, v13.4S // .........................................................................................................................................*.............................................................................................................................................. + sub v22.4S, v24.4S, v13.4S // ........................................................................................................................................*............................................................................................................................................... + add v24.4S, v27.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + mul v20.4S, v17.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v13.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + sqrdmulh v16.4S, v17.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mul v17.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v22.4S, v11.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v27.4S, v27.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + mls v20.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mls v17.4S, v13.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + mul v13.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... 
+ mul v19.4S, v11.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v16.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v27.4S, v17.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v11.4S, v31.4S, v17.4S // ................................................................................................................................................................................*....................................................................................................... + mls v19.4S, v22.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v27.4S, v11.4S, v27.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ mls v13.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sqrdmulh v22.4S, v8.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mul v16.4S, v8.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + mls v17.4S, v27.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + cmge v27.4S, v13.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v8.4S, v31.4S, v13.4S // ............................................................................................................................................................................................................*........................................................................... 
+ mls v16.4S, v22.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v8.4S, v8.4S, v27.4S // ..............................................................................................................................................................................................................*......................................................................... + cmge v22.4S, v31.4S, v15.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v11.4S, v15.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mls v13.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + sub v11.4S, v22.4S, v11.4S // ......................................................................................................................................................................................................*................................................................................. 
+ sub v8.4S, v20.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. + cmge v27.4S, v16.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v15.4S, v11.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + add v14.4S, v20.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v20.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v22.4S, v31.4S, v16.4S // ....................................................................................................................................................................................................................................................................*................... 
+ str q17, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sub v11.4S, v22.4S, v27.4S // ......................................................................................................................................................................................................................................................................*................. + mul v27.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + sqrdmulh v22.4S, v8.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mls v16.4S, v11.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sqrdmulh v12.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ mls v27.4S, v20.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mul v8.4S, v8.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + str q16, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + sqrdmulh v11.4S, v24.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v20.4S, v23.4S, v10.4S // .......................................................................................................................................................*................................................................................................................................ + mls v8.4S, v22.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. 
+ mul v17.4S, v24.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + mul v24.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v20.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + cmge v16.4S, v8.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v14.4S, v31.4S, v8.4S // ........................................................................................................................................................................................................*............................................................................... 
+ mls v24.4S, v20.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sub v14.4S, v14.4S, v16.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v22.4S, v12.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v16.4S, v28.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v8.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v17.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ mul v12.4S, v16.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v16.4S, v16.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + str q8, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + sqrdmulh v20.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v14.4S, v17.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v12.4S, v16.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
+ cmge v8.4S, v31.4S, v17.4S // ............................................................................................................................................................................................................................................................................*........... + mul v11.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sub v8.4S, v8.4S, v14.4S // ..............................................................................................................................................................................................................................................................................*......... + cmge v14.4S, v31.4S, v12.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v21.4S, v12.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v17.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ sub v16.4S, v14.4S, v21.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v14.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v11.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + cmge v21.4S, v19.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v12.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v16.4S, v31.4S, v19.4S // ................................................................................................................................................................................................................................................*....................................... 
+ cmge v8.4S, v11.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v20.4S, v31.4S, v11.4S // ................................................................................................................................................................................................*....................................................................................... + str q12, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sub v8.4S, v20.4S, v8.4S // ..................................................................................................................................................................................................*..................................................................................... + sub v21.4S, v16.4S, v21.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v16.4S, v31.4S, v27.4S // ................................................................................................................................................................................................................................................................*....................... 
+ mls v11.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v19.4S, v21.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v12.4S, v31.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v21.4S, v27.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q11, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v16.4S, v16.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ str q19, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v21.4S, v24.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v27.4S, v16.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v16.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + sub v11.4S, v12.4S, v21.4S // ..............................................................................................................................................................................................*......................................................................................... + cmge v21.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. 
+ str q27, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v24.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mul v11.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sub v16.4S, v16.4S, v21.4S // ..........................................................................................................................................................................................................................................................................*............. + add v12.4S, v28.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + str q24, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... 
+ mls v22.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + mul v24.4S, v12.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mls v11.4S, v14.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + sqrdmulh v14.4S, v12.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + add v27.4S, v23.4S, v10.4S // ........................................................................................................................................................*............................................................................................................................... 
+ cmge v16.4S, v11.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mls v24.4S, v14.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mul v22.4S, v27.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v10.4S, v27.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + cmge v14.4S, v31.4S, v11.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v19.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................................................................*............................... 
+ cmge v21.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v22.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v21.4S, v19.4S, v21.4S // ..........................................................................................................................................................................................................................................................*............................. + sub v28.4S, v14.4S, v16.4S // ......................................................................................................................................................................................................................................................*................................. + str q17, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + mls v24.4S, v21.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ str q13, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + cmge v23.4S, v22.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v20.4S, v31.4S, v22.4S // ............................................................................................................................................................................................................................................................*........................... + str q24, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v17.4S, v20.4S, v23.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v11.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ str q15, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. + mls v22.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + str q11, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + str q22, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... 
+ + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s new file mode 100644 index 00000000..568e8ff2 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_a72.s @@ -0,0 +1,1810 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +// Vector load/store wrappers on the q-register alias qform_<vec>: +// the *_vo forms address [base, #offset]; the *_vi forms access [base] and then post-increment base by inc. +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +// Element-wise sub/add/sqrdmulh/mul/mls on four 32-bit lanes (.4s); d is the destination, a and b the sources. +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +// By-element variants: the second source operand is the single 32-bit lane b.s[i]. +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +// trn1/trn2 wrappers on 64-bit lanes (_d, .2d) and 32-bit lanes (_s, .4s). +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +// dst = src * const.s[idx0] - sqrdmulh(src, const.s[idx1]) * modulus. +// Overwrites src with the sqrdmulh result; reads the register alias `modulus`. +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm + +// Same computation as mulmodq, with full-vector operands const / const_twisted instead of single lanes. +// Overwrites src. +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +// a -= srshr(a, #23) * modulus (srshr: signed rounding shift right). Clobbers the register alias `tmp`. +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +// Per lane: tmp1 = -1 where neg_modulus_half >= a, tmp2 = -1 where a >= modulus_half (cmge sets a lane to all-ones when true). +// tmp2 = tmp1 - tmp2 is then -1 / 0 / +1, so the final mls adds modulus to lanes flagged by tmp1 and subtracts it from lanes flagged by tmp2. +// Clobbers tmp1 and tmp2. +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +// Butterfly with a lane-indexed root: tmp = a - b; a = a + b; b = mulmodq(tmp, root, idx0, idx1). Clobbers `tmp`. +.macro gs_butterfly a, b, root, idx0, idx1 + 
vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +// dst = src * const - sqrdmulh(src, const_twisted) * modulus; overwrites src. Body is identical to mulmod. +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +// Butterfly with full-vector roots: tmp = a - b; a = a + b; b = mulmod(tmp, root, root_twisted). Clobbers `tmp`. +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +// Applies mulmod with the constants ninv / ninv_tw to src0..src7, writing dst0..dst7 (each src register is overwritten by mulmod). +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +// Loads eight consecutive 16-byte vectors into root0..root7 and advances r_ptr by 8*16 bytes. +// The post-increment happens on the first load, hence the negative offsets on the remaining seven. +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +// Loads one 16-byte vector from r_ptr0 and advances the pointer by 16 bytes. +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +// Loads one 16-byte vector from r_ptr0 but advances the pointer by only 8 bytes. +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +// Loads six consecutive 16-byte vectors (root0, root0_tw, root1, root1_tw, root2, root2_tw) and advances r_ptr1 by 6*16 bytes. +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +// In-place 4x4 transpose of the 32-bit lanes of <data>0..<data>3; uses t0..t3 as scratch. +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +// Reserves 6*16 bytes of stack and stores x19..x28 pairwise plus x29 into it; each pair is stored exactly once. +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, 
x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +// Inverse of save_gprs: reloads x19..x29 from the same slots and releases the 6*16 bytes. +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +// Reserves 4*16 bytes of stack and stores d8..d15 (the low 64 bits of v8..v15) into it. +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +// Inverse of save_vregs: reloads d8..d15 and releases the 4*16 bytes. +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +// restore/save: load/store register a from/to the stack slot at byte offset loc from sp. Note the swapped argument order between the two. +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +// Prologue: save GPRs, then vector registers, then reserve STACK_SIZE bytes of scratch below them. +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +// Epilogue: undo push_stack in reverse order. +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4_opt_a72 + .global _intt_dilithium_1234_5678_manual_ld4_opt_a72 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4_opt_a72: +_intt_dilithium_1234_5678_manual_ld4_opt_a72: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + 
qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ldr q4, [x0, #48] // .*................................................. + ldr q2, [x0, #32] // *.................................................. + // gap // ................................................... + ldr q22, [x0, #0] // ..*................................................ + ldr q3, [x0, #16] // ...*............................................... + // gap // ................................................... + ldr q30, [x4], #8 // ........................................*.......... + ldr q10, [x3, #80] // ......*............................................ + // gap // ................................................... + ldr q12, [x3, #32] // ....*.............................................. + // gap // ................................................... + // gap // ................................................... + trn2 v14.4S, v2.4S, v4.4S // ...........*....................................... + trn1 v2.4S, v2.4S, v4.4S // ........*.......................................... 
+ ldr q19, [x3, #48] // .............*..................................... + trn2 v0.4S, v22.4S, v3.4S // ..........*........................................ + trn1 v25.4S, v22.4S, v3.4S // .........*......................................... + ldr q7, [x4], #16 // ..........................................*........ + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + trn2 v5.2D, v0.2D, v14.2D // ...............*................................... + trn2 v11.2D, v25.2D, v2.2D // .................*................................. + // gap // ................................................... + trn1 v28.2D, v25.2D, v2.2D // ............*...................................... + trn1 v3.2D, v0.2D, v14.2D // ..............*.................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sub v15.4S, v11.4S, v5.4S // ....................*.............................. + // gap // ................................................... + // gap // ................................................... + add v8.4S, v28.4S, v3.4S // ...................*............................... + // gap // ................................................... + // gap // ................................................... + sub v2.4S, v28.4S, v3.4S // ................*.................................. + // gap // ................................................... + // gap // ................................................... 
+ sqrdmulh v13.4S, v15.4S, v10.4S // .......................*........................... + ldr q10, [x3, #64] // .....*............................................. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v18.4S, v2.4S, v19.4S // .....................*............................. + // gap // ................................................... + // gap // ................................................... + add v28.4S, v11.4S, v5.4S // ......................*............................ + // gap // ................................................... + // gap // ................................................... + mul v2.4S, v2.4S, v12.4S // ..................*................................ + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mul v15.4S, v15.4S, v10.4S // .........................*......................... + ldr q10, [x3, #16] // .......*........................................... + sub v3.4S, v8.4S, v28.4S // ..........................*........................ + add v31.4S, v8.4S, v28.4S // ........................*.......................... + // gap // ................................................... + // gap // ................................................... + mls v2.4S, v18.4S, v29.4S // ...........................*....................... + ldr q18, [x3], #(6*16) // .............................*..................... + // gap // ................................................... + // gap // ................................................... 
+ // gap // ................................................... + // gap // ................................................... + mls v15.4S, v13.4S, v29.4S // ............................*...................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v22.4S, v3.4S, v10.4S // ..............................*.................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mul v21.4S, v3.4S, v18.4S // ................................*.................. + // gap // ................................................... + // gap // ................................................... + sub v5.4S, v2.4S, v15.4S // ...............................*................... + // gap // ................................................... + // gap // ................................................... + add v11.4S, v2.4S, v15.4S // ....................................*.............. + // gap // ................................................... + // gap // ................................................... + mls v21.4S, v22.4S, v29.4S // ..................................*................ + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ sqrdmulh v2.4S, v5.4S, v10.4S // .................................*................. + trn1 v0.4S, v31.4S, v11.4S // .......................................*........... + // gap // ................................................... + trn2 v11.4S, v31.4S, v11.4S // ......................................*............ + // gap // ................................................... + // gap // ................................................... + mul v18.4S, v5.4S, v18.4S // ...................................*............... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mls v18.4S, v2.4S, v29.4S // .....................................*............. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ // gap // ................................................... + trn2 v2.4S, v21.4S, v18.4S // ...........................................*....... + trn1 v27.4S, v21.4S, v18.4S // .........................................*......... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + trn2 v15.2D, v11.2D, v2.2D // .............................................*..... + trn2 v24.2D, v0.2D, v27.2D // ............................................*...... + trn1 v13.2D, v0.2D, v27.2D // ..............................................*.... + // gap // ................................................... + // gap // ................................................... + trn1 v27.2D, v11.2D, v2.2D // ...............................................*... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + sub v19.4S, v24.4S, v15.4S // ................................................*.. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ mul v18.4S, v19.4S, v7.S[2] // ..................................................* + add v11.4S, v13.4S, v27.4S // .................................................*. + + // original source code + // ldr q21, [x0, #32] // .*................................................. + // ldr q16, [x0, #48] // *.................................................. + // ldr q25, [x0, #0] // ..*................................................ + // ldr q2, [x0, #16] // ...*............................................... + // ldr q28, [x3, #32] // ......*............................................ + // ldr q4, [x3, #64] // .....................*............................. + // ldr q13, [x3, #80] // .....*............................................. + // ldr q23, [x3, #16] // ..........................*........................ + // trn1 v14.4S, v21.4S, v16.4S // ........*.......................................... + // trn1 v19.4S, v25.4S, v2.4S // ...........*....................................... + // trn2 v9.4S, v25.4S, v2.4S // ..........*........................................ + // trn2 v17.4S, v21.4S, v16.4S // .......*........................................... + // trn1 v12.2D, v19.2D, v14.2D // ...............*................................... + // ldr q22, [x3, #48] // .........*......................................... + // trn1 v20.2D, v9.2D, v17.2D // ................*.................................. + // trn2 v1.2D, v9.2D, v17.2D // .............*..................................... + // sub v17.4S, v12.4S, v20.4S // ...................*............................... + // trn2 v2.2D, v19.2D, v14.2D // ..............*.................................... + // mul v8.4S, v17.4S, v28.4S // ........................*.......................... + // add v7.4S, v12.4S, v20.4S // ..................*................................ + // sub v6.4S, v2.4S, v1.4S // .................*................................. 
+ // sqrdmulh v10.4S, v17.4S, v22.4S // ......................*............................ + // add v5.4S, v2.4S, v1.4S // .......................*........................... + // sqrdmulh v2.4S, v6.4S, v13.4S // ....................*.............................. + // add v31.4S, v7.4S, v5.4S // ............................*...................... + // mul v6.4S, v6.4S, v4.4S // .........................*......................... + // sub v28.4S, v7.4S, v5.4S // ...........................*....................... + // mls v8.4S, v10.4S, v29.4S // .............................*..................... + // mls v6.4S, v2.4S, v29.4S // ...............................*................... + // ldr q2, [x3], #(6*16) // ..............................*.................... + // sqrdmulh v10.4S, v28.4S, v23.4S // ................................*.................. + // sub v16.4S, v8.4S, v6.4S // ..................................*................ + // mul v22.4S, v28.4S, v2.4S // .................................*................. + // sqrdmulh v12.4S, v16.4S, v23.4S // .....................................*............. + // mls v22.4S, v10.4S, v29.4S // ....................................*.............. + // mul v23.4S, v16.4S, v2.4S // ........................................*.......... + // add v2.4S, v8.4S, v6.4S // ...................................*............... + // mls v23.4S, v12.4S, v29.4S // .........................................*......... + // trn2 v19.4S, v31.4S, v2.4S // .......................................*........... + // trn1 v2.4S, v31.4S, v2.4S // ......................................*............ + // ldr q30, [x4], #8 // ....*.............................................. + // trn1 v18.4S, v22.4S, v23.4S // ...........................................*....... + // ldr q7, [x4], #16 // ............*...................................... + // trn2 v14.4S, v22.4S, v23.4S // ..........................................*........ 
+ // trn2 v24.2D, v2.2D, v18.2D // .............................................*..... + // trn2 v15.2D, v19.2D, v14.2D // ............................................*...... + // trn1 v13.2D, v2.2D, v18.2D // ..............................................*.... + // trn1 v27.2D, v19.2D, v14.2D // ...............................................*... + // sub v19.4S, v24.4S, v15.4S // ................................................*.. + // add v11.4S, v13.4S, v27.4S // ..................................................* + // mul v18.4S, v19.4S, v7.S[2] // .................................................*. + + sub count, count, #1 +layer5678_start: + sub v26.4S, v13.4S, v27.4S // ................................................*........................... + ldr q21, [x0, #96] // ..e......................................................................... + ldr q16, [x0, #112] // ...e........................................................................ + ldr q25, [x0, #64] // e........................................................................... + ldr q2, [x0, #80] // .e.......................................................................... + add v8.4S, v24.4S, v15.4S // ......................................................*..................... + sqrdmulh v10.4S, v19.4S, v7.S[3] // ........................................................*................... + ldr q28, [x3, #32] // ..............e............................................................. + ldr q4, [x3, #64] // ................e........................................................... + ldr q13, [x3, #80] // .................e.......................................................... + ldr q23, [x3, #16] // .............e.............................................................. + // gap // ............................................................................ + trn1 v14.4S, v21.4S, v16.4S // ......e..................................................................... 
+ sqrdmulh v22.4S, v26.4S, v7.S[1] // ...................................................*........................ + // gap // ............................................................................ + trn1 v19.4S, v25.4S, v2.4S // ....e....................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v9.4S, v25.4S, v2.4S // .....e...................................................................... + mul v26.4S, v26.4S, v7.S[0] // ..................................................*......................... + // gap // ............................................................................ + trn2 v17.4S, v21.4S, v16.4S // .......e.................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + mls v18.4S, v10.4S, v29.4S // .........................................................*.................. + trn1 v12.2D, v19.2D, v14.2D // ..........e................................................................. + // gap // ............................................................................ + sub v24.4S, v11.4S, v8.4S // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v26.4S, v22.4S, v29.4S // ....................................................*....................... + ldr q22, [x3, #48] // ...............e............................................................ + trn1 v20.2D, v9.2D, v17.2D // ...........e................................................................ 
+ trn2 v1.2D, v9.2D, v17.2D // .........e.................................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v3.4S, v11.4S, v8.4S // ...........................................................*................ + mul v0.4S, v24.4S, v30.S[0] // ............................................................*............... + // gap // ............................................................................ + sub v17.4S, v12.4S, v20.4S // ..................e......................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v15.4S, v24.4S, v30.S[1] // .............................................................*.............. + trn2 v2.2D, v19.2D, v14.2D // ........e................................................................... + // gap // ............................................................................ + srshr v21.4S, v3.4S, #23 // ....................................................................*....... + // gap // ............................................................................ + // gap // ............................................................................ + mul v8.4S, v17.4S, v28.4S // ....................e....................................................... + add v7.4S, v12.4S, v20.4S // ...................e........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v6.4S, v2.4S, v1.4S // .......................e.................................................... + sqrdmulh v10.4S, v17.4S, v22.4S // .....................e...................................................... + add v5.4S, v2.4S, v1.4S // ........................e................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v2.4S, v6.4S, v13.4S // ..........................e................................................. + // gap // ............................................................................ + // gap // ............................................................................ + add v31.4S, v7.4S, v5.4S // .............................e.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + mul v6.4S, v6.4S, v4.4S // .........................e.................................................. + sub v28.4S, v7.4S, v5.4S // ............................e............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v8.4S, v10.4S, v29.4S // ......................e..................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v6.4S, v2.4S, v29.4S // ...........................e................................................ + ldr q2, [x3], #(6*16) // ............e............................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v0.4S, v15.4S, v29.4S // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v10.4S, v28.4S, v23.4S // ...............................e............................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v16.4S, v8.4S, v6.4S // .................................e.......................................... 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v22.4S, v28.4S, v2.4S // ..............................e............................................. + // gap // ............................................................................ + // gap // ............................................................................ + str q0, [x0, #32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v12.4S, v16.4S, v23.4S // ....................................e....................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v11.4S, v26.4S, v18.4S // ...............................................................*............ + mls v22.4S, v10.4S, v29.4S // ................................e........................................... + add v10.4S, v26.4S, v18.4S // ................................................................*........... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mul v23.4S, v16.4S, v2.4S // ...................................e........................................ + add v2.4S, v8.4S, v6.4S // ..................................e......................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v23.4S, v12.4S, v29.4S // .....................................e...................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn2 v19.4S, v31.4S, v2.4S // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v2.4S, v31.4S, v2.4S // ......................................e..................................... + // gap // ............................................................................ + sqrdmulh v8.4S, v11.4S, v30.S[1] // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v1.4S, v11.4S, v30.S[0] // .................................................................*.......... + // gap // ............................................................................ 
+ ldr q30, [x4], #8 // ..............................................e............................. + trn1 v18.4S, v22.4S, v23.4S // ........................................e................................... + ldr q7, [x4], #16 // ...............................................e............................ + // gap // ............................................................................ + mls v3.4S, v21.4S, v29.4S // .....................................................................*...... + trn2 v14.4S, v22.4S, v23.4S // .........................................e.................................. + // gap // ............................................................................ + srshr v22.4S, v10.4S, #23 // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + mls v1.4S, v8.4S, v29.4S // ...................................................................*........ + trn2 v24.2D, v2.2D, v18.2D // ..........................................e................................. + // gap // ............................................................................ + trn2 v15.2D, v19.2D, v14.2D // ...........................................e................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v10.4S, v22.4S, v29.4S // .......................................................................*.... + trn1 v13.2D, v2.2D, v18.2D // ............................................e............................... + // gap // ............................................................................ 
+ trn1 v27.2D, v19.2D, v14.2D // .............................................e.............................. + str q3, [x0], #(16*4) // ........................................................................*... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v19.4S, v24.4S, v15.4S // .....................................................e...................... + str q1, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + // gap // ............................................................................ + add v11.4S, v13.4S, v27.4S // .................................................e.......................... + // gap // ............................................................................ + // gap // ............................................................................ + str q10, [x0, #-48] // .........................................................................*.. + mul v18.4S, v19.4S, v7.S[2] // .......................................................e.................... + // gap // ............................................................................ + + // original source code + // ldr q8, [x0, #(16*0)] // ..e........................................................................|..e....................................................................... + // ldr q9, [x0, #(16*1)] // ...e.......................................................................|...e...................................................................... 
+ // ldr q10, [x0, #(16*2)] // e..........................................................................|e......................................................................... + // ldr q11, [x0, #(16*3)] // .e.........................................................................|.e........................................................................ + // trn1 v25.4s, v8.4s, v9.4s // ............e..............................................................|............e............................................................. + // trn2 v26.4s, v8.4s, v9.4s // .............e.............................................................|.............e............................................................ + // trn1 v27.4s, v10.4s, v11.4s // ..........e................................................................|..........e............................................................... + // trn2 v28.4s, v10.4s, v11.4s // ...............e...........................................................|...............e.......................................................... + // trn2 v10.2d, v25.2d, v27.2d // ...........................e...............................................|...........................e.............................................. + // trn2 v11.2d, v26.2d, v28.2d // ......................e....................................................|......................e................................................... + // trn1 v8.2d, v25.2d, v27.2d // .................e.........................................................|.................e........................................................ + // trn1 v9.2d, v26.2d, v28.2d // .....................e.....................................................|.....................e.................................................... 
+ // ldr q0, [x3], #(6*16) // ........................................e..................................|........................................e................................. + // ldr q4, [x3, #(-6*16 + 1*16)] // .........e.................................................................|.........e................................................................ + // ldr q1, [x3, #(-6*16 + 2*16)] // ......e....................................................................|......e................................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ....................e......................................................|....................e..................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // .......e...................................................................|.......e.................................................................. + // ldr q6, [x3, #(-6*16 + 5*16)] // ........e..................................................................|........e................................................................. + // sub v24.4s, v8.4s, v9.4s // .........................e.................................................|.........................e................................................ + // add v8.4s, v8.4s, v9.4s // ..............................e............................................|..............................e........................................... + // mul v9.4s, v24.4s, v1.4s // .............................e.............................................|.............................e............................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ................................e..........................................|................................e......................................... 
+ // mls v9.4s, v24.4s, v29.4s // ......................................e....................................|......................................e................................... + // sub v24.4s, v10.4s, v11.4s // ...............................e...........................................|...............................e.......................................... + // add v10.4s, v10.4s, v11.4s // .................................e.........................................|.................................e........................................ + // mul v11.4s, v24.4s, v2.4s // ....................................e......................................|....................................e..................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..................................e........................................|..................................e....................................... + // mls v11.4s, v24.4s, v29.4s // .......................................e...................................|.......................................e.................................. + // sub v24.4s, v8.4s, v10.4s // .....................................e.....................................|.....................................e.................................... + // add v8.4s, v8.4s, v10.4s // ...................................e.......................................|...................................e...................................... + // mul v10.4s, v24.4s, v0.4s // ............................................e..............................|............................................e............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................................e................................|..........................................e............................... 
+ // mls v10.4s, v24.4s, v29.4s // ................................................e..........................|................................................e......................... + // sub v24.4s, v9.4s, v11.4s // ...........................................e...............................|...........................................e.............................. + // add v9.4s, v9.4s, v11.4s // ...................................................e.......................|...................................................e...................... + // mul v11.4s, v24.4s, v0.4s // ..................................................e........................|..................................................e....................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................e............................|..............................................e........................... + // mls v11.4s, v24.4s, v29.4s // ....................................................e......................|....................................................e..................... + // trn1 v25.4s, v8.4s, v9.4s // ......................................................e....................|......................................................e................... + // trn2 v26.4s, v8.4s, v9.4s // .....................................................e.....................|.....................................................e.................... + // trn1 v27.4s, v10.4s, v11.4s // ..........................................................e................|..........................................................e............... + // trn2 v28.4s, v10.4s, v11.4s // .............................................................e.............|.............................................................e............ 
+ // trn2 v10.2d, v25.2d, v27.2d // ................................................................e..........|................................................................e......... + // trn2 v11.2d, v26.2d, v28.2d // .................................................................e.........|.................................................................e........ + // trn1 v8.2d, v25.2d, v27.2d // ...................................................................e.......|...................................................................e...... + // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e......|....................................................................e..... + // ldr q1, [x4], #8 // .........................................................e.................|.........................................................e................ + // ldr q0, [x4], #16 // ...........................................................e...............|...........................................................e.............. + // sub v24.4s, v8.4s, v9.4s // ...........................................................................*.......................................................................... + // add v8.4s, v8.4s, v9.4s // ........................................................................e..|........................................................................e. + // mul v9.4s, v24.4s, v0.s[0] // ..............*............................................................|..............*........................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........*...............................................................|...........*.............................................................. 
+ // mls v9.4s, v24.4s, v29.4s // ...................*.......................................................|...................*...................................................... + // sub v24.4s, v10.4s, v11.4s // ......................................................................e....|......................................................................e... + // add v10.4s, v10.4s, v11.4s // ....*......................................................................|....*..................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ..........................................................................e|.......................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*.....................................................................|.....*.................................................................... + // mls v11.4s, v24.4s, v29.4s // ................*..........................................................|................*......................................................... + // sub v24.4s, v8.4s, v10.4s // ..................*........................................................|..................*....................................................... + // add v8.4s, v8.4s, v10.4s // .......................*...................................................|.......................*.................................................. + // mul v10.4s, v24.4s, v1.s[0] // ........................*..................................................|........................*................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................*................................................|..........................*............................................... 
+ // mls v10.4s, v24.4s, v29.4s // .........................................*.................................|.........................................*................................ + // sub v24.4s, v9.4s, v11.4s // ...............................................*...........................|...............................................*.......................... + // add v9.4s, v9.4s, v11.4s // .................................................*.........................|.................................................*........................ + // mul v11.4s, v24.4s, v1.s[0] // ........................................................*..................|........................................................*................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................*...................|.......................................................*.................. + // mls v11.4s, v24.4s, v29.4s // ...............................................................*...........|...............................................................*.......... + // srshr v24.4S, v8.4S, #23 // ............................*..............................................|............................*............................................. + // mls v8.4s, v24.4s, v29.4s // ............................................................*..............|............................................................*............. + // srshr v24.4S, v9.4S, #23 // ..............................................................*............|..............................................................*........... + // mls v9.4s, v24.4s, v29.4s // ..................................................................*........|..................................................................*....... 
+ // str q8, [x0], #(16*4) // .....................................................................*.....|.....................................................................*.... + // str q9, [x0, #(-16*4 + 1*16)] // .........................................................................*.|.........................................................................* + // str q10, [x0, #(-16*4 + 2*16)] // .............................................*.............................|.............................................*............................ + // str q11, [x0, #(-16*4 + 3*16)] // .......................................................................*...|.......................................................................*.. + + sub count, count, #1 + cbnz count, layer5678_start + sub v25.4S, v13.4S, v27.4S // *........................ + sqrdmulh v14.4S, v19.4S, v7.S[3] // ..*...................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + add v15.4S, v24.4S, v15.4S // .*....................... + // gap // ......................... + // gap // ......................... + sqrdmulh v17.4S, v25.4S, v7.S[1] // ...*..................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mul v26.4S, v25.4S, v7.S[0] // ....*.................... + sub v5.4S, v11.4S, v15.4S // ......*.................. + // gap // ......................... + add v24.4S, v11.4S, v15.4S // ........*................ + // gap // ......................... + // gap // ......................... + // gap // ......................... + mls v18.4S, v14.4S, v29.4S // .....*................... + // gap // ......................... + // gap // ......................... + // gap // ......................... 
+ // gap // ......................... + mls v26.4S, v17.4S, v29.4S // .......*................. + srshr v15.4S, v24.4S, #23 // ...........*............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mul v13.4S, v5.4S, v30.S[0] // .........*............... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + sqrdmulh v23.4S, v5.4S, v30.S[1] // ..........*.............. + // gap // ......................... + // gap // ......................... + sub v20.4S, v26.4S, v18.4S // ..............*.......... + // gap // ......................... + // gap // ......................... + mls v24.4S, v15.4S, v29.4S // ..................*...... + add v8.4S, v26.4S, v18.4S // ...............*......... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + sqrdmulh v18.4S, v20.4S, v30.S[1] // ................*........ + // gap // ......................... + // gap // ......................... + srshr v21.4S, v8.4S, #23 // ...................*..... + // gap // ......................... + // gap // ......................... + mul v9.4S, v20.4S, v30.S[0] // .................*....... + // gap // ......................... + // gap // ......................... + str q24, [x0], #(16*4) // ......................*.. + // gap // ......................... + // gap // ......................... + mls v13.4S, v23.4S, v29.4S // ............*............ + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mls v9.4S, v18.4S, v29.4S // ....................*.... + // gap // ......................... 
+ // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + mls v8.4S, v21.4S, v29.4S // .....................*... + // gap // ......................... + // gap // ......................... + str q13, [x0, #-32] // .............*........... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + str q9, [x0, #-16] // .......................*. + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + str q8, [x0, #-48] // ........................* + // gap // ......................... + // gap // ......................... + + // original source code + // sub v26.4S, v13.4S, v27.4S // *........................ + // add v8.4S, v24.4S, v15.4S // ..*...................... + // sqrdmulh v10.4S, v19.4S, v7.S[3] // .*....................... + // sqrdmulh v22.4S, v26.4S, v7.S[1] // ...*..................... + // mul v26.4S, v26.4S, v7.S[0] // ....*.................... + // mls v18.4S, v10.4S, v29.4S // .......*................. + // sub v24.4S, v11.4S, v8.4S // .....*................... + // mls v26.4S, v22.4S, v29.4S // ........*................ + // add v3.4S, v11.4S, v8.4S // ......*.................. + // mul v0.4S, v24.4S, v30.S[0] // ..........*.............. + // sqrdmulh v15.4S, v24.4S, v30.S[1] // ...........*............. + // srshr v21.4S, v3.4S, #23 // .........*............... + // mls v0.4S, v15.4S, v29.4S // ...................*..... + // str q0, [x0, #32] // ......................*.. + // sub v11.4S, v26.4S, v18.4S // ............*............ + // add v10.4S, v26.4S, v18.4S // ..............*.......... + // sqrdmulh v8.4S, v11.4S, v30.S[1] // ...............*......... 
+ // mul v1.4S, v11.4S, v30.S[0] // .................*....... + // mls v3.4S, v21.4S, v29.4S // .............*........... + // srshr v22.4S, v10.4S, #23 // ................*........ + // mls v1.4S, v8.4S, v29.4S // ....................*.... + // mls v10.4S, v22.4S, v29.4S // .....................*... + // str q3, [x0], #(16*4) // ..................*...... + // str q1, [x0, #-16] // .......................*. + // str q10, [x0, #-48] // ........................* + + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q18, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q10, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ 
+ ldr q22, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q27, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q13, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q15, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q24, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q8, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... 
+ sub v11.4S, v18.4S, v10.4S // .........................................*.............................................................................................................................................................................................................................................. + add v18.4S, v18.4S, v10.4S // ..........................................*............................................................................................................................................................................................................................................. + ldr q10, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q23, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + sub v9.4S, v22.4S, v13.4S // ...............................*........................................................................................................................................................................................................................................................ + add v16.4S, v22.4S, v13.4S // ................................*....................................................................................................................................................................................................................................................... 
+ ldr q20, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + ldr q22, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v11.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + sub v28.4S, v24.4S, v8.4S // ................*....................................................................................................................................................................................................................................................................... + add v24.4S, v24.4S, v8.4S // .................*...................................................................................................................................................................................................................................................................... + mul v19.4S, v11.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ 
+ sub v8.4S, v23.4S, v27.4S // ....................................*................................................................................................................................................................................................................................................... + add v21.4S, v23.4S, v27.4S // .....................................*.................................................................................................................................................................................................................................................. + add v27.4S, v22.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. + sqrdmulh v11.4S, v9.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + sub v12.4S, v22.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. + mul v13.4S, v9.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sub v23.4S, v20.4S, v10.4S // ..........................*............................................................................................................................................................................................................................................................. + add v15.4S, v20.4S, v10.4S // ...........................*............................................................................................................................................................................................................................................................ + sqrdmulh v20.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + add v10.4S, v24.4S, v27.4S // .........................................................*.............................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v27.4S // ........................................................*............................................................................................................................................................................................................................... + mul v23.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... 
+ add v27.4S, v15.4S, v16.4S // ...................................................................*.................................................................................................................................................................................................................... + sub v22.4S, v15.4S, v16.4S // ..................................................................*..................................................................................................................................................................................................................... + mls v19.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mls v13.4S, v11.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v8.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + mul v8.4S, v8.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. 
+ mul v14.4S, v22.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sqrdmulh v15.4S, v22.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + sqrdmulh v17.4S, v12.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + mul v22.4S, v12.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + sub v12.4S, v10.4S, v27.4S // ................................................................................................*....................................................................................................................................................................................... + add v27.4S, v10.4S, v27.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sqrdmulh v16.4S, v28.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v9.4S, v28.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + add v28.4S, v21.4S, v18.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v21.4S, v21.4S, v18.4S // ............................................................................*........................................................................................................................................................................................................... + mls v23.4S, v20.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v24.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ 
+ mul v10.4S, v24.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + ldr q24, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + sub v20.4S, v23.4S, v13.4S // .......................................................................*................................................................................................................................................................................................................ + add v13.4S, v23.4S, v13.4S // ........................................................................*............................................................................................................................................................................................................... + mls v8.4S, v11.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + ldr q11, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... 
+ ldr q23, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mls v22.4S, v17.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + add v17.4S, v8.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... + sub v19.4S, v8.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... + ldr q8, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + mls v9.4S, v16.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ mls v14.4S, v15.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v15.4S, v24.4S, v23.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v23.4S, v24.4S, v23.4S // ..............................................*......................................................................................................................................................................................................................................... + mls v10.4S, v18.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sub v16.4S, v11.4S, v8.4S // ...................................................*.................................................................................................................................................................................................................................... + add v8.4S, v11.4S, v8.4S // ....................................................*................................................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v23.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v18.4S, v9.4S, v22.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v11.4S, v9.4S, v22.4S // .............................................................*.......................................................................................................................................................................................................................... + mul v22.4S, v23.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v16.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + mls v22.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... 
+ mul v16.4S, v16.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + mls v16.4S, v9.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sqrdmulh v24.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + add v23.4S, v22.4S, v16.4S // ............................................................................................*........................................................................................................................................................................................... + sub v9.4S, v22.4S, v16.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ sqrdmulh v16.4S, v19.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mul v19.4S, v19.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + add v22.4S, v17.4S, v23.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v23.4S, v17.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. + mls v12.4S, v24.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + add v24.4S, v15.4S, v8.4S // .......................................................................................*................................................................................................................................................................................................ 
+ sub v17.4S, v18.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + add v13.4S, v18.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. + mls v19.4S, v16.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v18.4S, v28.4S, v24.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v16.4S, v28.4S, v24.4S // ....................................................................................................................*................................................................................................................................................................... + mul v24.4S, v9.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... 
+ sub v28.4S, v15.4S, v8.4S // ......................................................................................*................................................................................................................................................................................................. + sqrdmulh v8.4S, v9.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + add v9.4S, v13.4S, v22.4S // ..............................................................................................................................................*......................................................................................................................................... + sub v22.4S, v13.4S, v22.4S // .............................................................................................................................................*.......................................................................................................................................... + mul v13.4S, v28.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. 
+ mls v24.4S, v8.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v8.4S, v21.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mul v15.4S, v21.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sqrdmulh v21.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mls v15.4S, v8.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + add v8.4S, v27.4S, v18.4S // .........................................................................................................................................*.............................................................................................................................................. 
+ sub v18.4S, v27.4S, v18.4S // ........................................................................................................................................*............................................................................................................................................... + mul v27.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mul v11.4S, v17.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mls v13.4S, v28.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + mls v27.4S, v21.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... 
+ sqrdmulh v21.4S, v20.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + add v28.4S, v15.4S, v13.4S // ...............................................................................................................................*........................................................................................................................................................ + sub v13.4S, v15.4S, v13.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v15.4S, v22.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v22.4S, v22.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mls v11.4S, v17.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. 
+ sub v17.4S, v19.4S, v24.4S // ...................................................................................................................................*.................................................................................................................................................... + add v19.4S, v19.4S, v24.4S // ....................................................................................................................................*................................................................................................................................................... + sqrdmulh v24.4S, v16.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v15.4S, v22.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mul v16.4S, v16.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + mls v16.4S, v24.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... 
+ mul v24.4S, v20.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + cmge v20.4S, v31.4S, v15.4S // ....................................................................................................................................................................................*................................................................................................... + mls v24.4S, v21.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + cmge v21.4S, v15.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v22.4S, v13.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sub v21.4S, v20.4S, v21.4S // ......................................................................................................................................................................................*................................................................................................. 
+ sqrdmulh v20.4S, v13.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + sub v13.4S, v27.4S, v24.4S // ...............................................................................................................*........................................................................................................................................................................ + add v27.4S, v27.4S, v24.4S // ................................................................................................................*....................................................................................................................................................................... + sqrdmulh v24.4S, v23.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v15.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v21.4S, v10.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. 
+ mls v22.4S, v20.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + sqrdmulh v20.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + str q15, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v15.4S, v23.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + mul v23.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + mls v23.4S, v20.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ sub count, count, #1 +layer1234_start: + add v10.4S, v10.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + mls v15.4S, v24.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v20.4S, v12.4S, v16.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v16.4S // .............................................................................................................................................................*.......................................................................................................................... + sqrdmulh v14.4S, v17.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + sub v24.4S, v10.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... 
+ add v10.4S, v10.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... + mul v17.4S, v17.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + add v13.4S, v11.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v15.4S, v11.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + mul v28.4S, v21.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + add v11.4S, v27.4S, v19.4S // ........................................................................................................................................................*............................................................................................................................... 
+ sqrdmulh v21.4S, v21.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + sub v27.4S, v27.4S, v19.4S // .......................................................................................................................................................*................................................................................................................................ + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sqrdmulh v19.4S, v18.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v18.4S, v18.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
+ mls v28.4S, v21.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v18.4S, v19.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + sub v19.4S, v28.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v28.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v21.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + cmge v28.4S, v31.4S, v18.4S // ................................................................................................................................................................................*....................................................................................................... + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + cmge v8.4S, v18.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + sub v8.4S, v28.4S, v8.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v22.4S, v27.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. 
+ mls v10.4S, v21.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mls v18.4S, v8.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v21.4S, v20.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + cmge v28.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + sqrdmulh v20.4S, v20.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v8.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ str q18, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v18.4S, v15.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sub v28.4S, v28.4S, v8.4S // ..........................................................................................................................................................................................................................................................*............................. + sqrdmulh v15.4S, v15.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v8.4S, v19.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. 
+ mls v10.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + mls v18.4S, v15.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v15.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v21.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + str q10, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... 
+ mul v10.4S, v27.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sub v27.4S, v28.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v20.4S, v18.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mls v10.4S, v22.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + cmge v22.4S, v31.4S, v18.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v28.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... 
+ mls v16.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v15.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... + sub v27.4S, v15.4S, v28.4S // ..................................................................................................................................................................................................*..................................................................................... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sqrdmulh v15.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sub v16.4S, v22.4S, v20.4S // ......................................................................................................................................................................................................*................................................................................. 
+ mls v8.4S, v19.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + mls v18.4S, v16.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sqrdmulh v28.4S, v24.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + str q18, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. 
+ sqrdmulh v18.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + cmge v14.4S, v31.4S, v10.4S // ............................................................................................................................................................................................*........................................................................................... + mul v24.4S, v24.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mls v21.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v24.4S, v28.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + cmge v28.4S, v10.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ mls v19.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v15.4S, v14.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + str q21, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + mls v22.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v18.4S, v23.4S, v17.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v14.4S, v31.4S, v8.4S // ........................................................................................................................................................................................................*............................................................................... 
+ mls v10.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + add v15.4S, v23.4S, v17.4S // ............................................................................................................................................................................*........................................................................................................... + sqrdmulh v21.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v9.4S, v8.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + str q10, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... 
+ mls v17.4S, v21.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + sub v28.4S, v14.4S, v9.4S // ..........................................................................................................................................................................................................*............................................................................. + ldr q9, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + cmge v20.4S, v24.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + ldr q27, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + cmge v16.4S, v31.4S, v24.4S // ........................................................................................................................................................................................*............................................................................................... 
+ mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + cmge v11.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + cmge v10.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v23.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + ldr q14, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ 
+ mls v8.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + cmge v28.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v16.4S, v16.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v20.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v11.4S, v11.4S, v10.4S // ..............................................................................................................................................................................................................................................................*......................... + sub v28.4S, v23.4S, v28.4S // ......................................................................................................................................................................................................................................................*................................. 
+ mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v13.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q8, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. + cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v17.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sub v28.4S, v27.4S, v9.4S // ................e....................................................................................................................................................................................................................................................................... 
+ mul v15.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v23.4S, v10.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v15.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v8.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + sqrdmulh v10.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + add v20.4S, v27.4S, v9.4S // .................e...................................................................................................................................................................................................................................................................... 
+ ldr q27, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + ldr q9, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + cmge v17.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + mul v12.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v24.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v12.4S, v10.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + ldr q10, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + sub v16.4S, v9.4S, v27.4S // .....................e.................................................................................................................................................................................................................................................................. + add v9.4S, v9.4S, v27.4S // ......................e................................................................................................................................................................................................................................................................. + sub v13.4S, v13.4S, v8.4S // ......................................................................................................................................................................................................................................................................*................. 
+ mls v22.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + str q24, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... + cmge v27.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v24.4S, v18.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v8.4S, v31.4S, v12.4S // ................................................................................................................................................................................................................................................................*....................... + mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ cmge v23.4S, v12.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + sub v17.4S, v17.4S, v27.4S // ..............................................................................................................................................................................................................................................................................*......... + sub v11.4S, v10.4S, v14.4S // .........................................e.............................................................................................................................................................................................................................................. + mul v27.4S, v18.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v8.4S, v8.4S, v23.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ ldr q23, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + mls v27.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + mls v12.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + add v18.4S, v10.4S, v14.4S // ..........................................e............................................................................................................................................................................................................................................. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + ldr q19, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. 
+ cmge v10.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... + mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + cmge v17.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + ldr q22, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + sub v24.4S, v20.4S, v9.4S // ........................................................e............................................................................................................................................................................................................................... + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ ldr q13, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + add v14.4S, v20.4S, v9.4S // .........................................................e.............................................................................................................................................................................................................................. + str q12, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + sub v12.4S, v10.4S, v17.4S // ..............................................................................................................................................................................................................*......................................................................... + sqrdmulh v8.4S, v11.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + add v10.4S, v19.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... 
+ sub v17.4S, v19.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ + str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + ldr q22, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + mls v27.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + mul v19.4S, v11.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ 
+ sub v15.4S, v22.4S, v23.4S // ..........................e............................................................................................................................................................................................................................................................. + mul v12.4S, v28.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + str q27, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + sqrdmulh v28.4S, v28.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v16.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v27.4S, v16.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ 
+ mls v12.4S, v28.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + add v11.4S, v22.4S, v23.4S // ...........................e............................................................................................................................................................................................................................................................ + ldr q16, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + mls v27.4S, v9.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + add v22.4S, v13.4S, v16.4S // .....................................e.................................................................................................................................................................................................................................................. + mls v19.4S, v8.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... 
+ sub v28.4S, v13.4S, v16.4S // ....................................e................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v17.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + sub v23.4S, v11.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... + add v20.4S, v12.4S, v27.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v12.4S, v12.4S, v27.4S // .............................................................e.......................................................................................................................................................................................................................... + sqrdmulh v21.4S, v15.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... 
+ mul v9.4S, v15.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + add v10.4S, v11.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... + sqrdmulh v15.4S, v28.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + add v11.4S, v14.4S, v10.4S // .................................................................................................e...................................................................................................................................................................................... + sub v14.4S, v14.4S, v10.4S // ................................................................................................e....................................................................................................................................................................................... + mul v10.4S, v17.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... 
+ mls v9.4S, v21.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + mls v10.4S, v8.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mul v17.4S, v28.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v12.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + add v28.4S, v22.4S, v18.4S // .............................................................................e.......................................................................................................................................................................................................... + mls v17.4S, v15.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... 
+ sub v8.4S, v9.4S, v10.4S // .......................................................................e................................................................................................................................................................................................................ + mul v15.4S, v12.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v21.4S, v23.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + sqrdmulh v16.4S, v14.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v12.4S, v14.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sub v22.4S, v22.4S, v18.4S // ............................................................................e........................................................................................................................................................................................................... 
+ mul v14.4S, v23.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + add v23.4S, v9.4S, v10.4S // ........................................................................e............................................................................................................................................................................................................... + ldr q10, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + ldr q9, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + mls v14.4S, v21.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + ldr q21, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... 
+ mls v12.4S, v16.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + mls v15.4S, v13.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + sub v27.4S, v21.4S, v9.4S // ..............................................e......................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v8.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + mul v13.4S, v8.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v18.4S, v27.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... 
+ ldr q8, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + mls v13.4S, v16.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + add v16.4S, v21.4S, v9.4S // ...............................................e........................................................................................................................................................................................................................................ + mul v27.4S, v27.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sub v21.4S, v8.4S, v10.4S // ...................................................e.................................................................................................................................................................................................................................... + add v8.4S, v8.4S, v10.4S // ....................................................e................................................................................................................................................................................................................................... 
+ mls v27.4S, v18.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + add v18.4S, v16.4S, v8.4S // .......................................................................................e................................................................................................................................................................................................ + sqrdmulh v9.4S, v22.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + sub v10.4S, v16.4S, v8.4S // ......................................................................................e................................................................................................................................................................................................. + mul v8.4S, v22.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sub v16.4S, v28.4S, v18.4S // ....................................................................................................................e................................................................................................................................................................... 
+ add v18.4S, v28.4S, v18.4S // .....................................................................................................................e.................................................................................................................................................................. + sqrdmulh v28.4S, v21.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mul v21.4S, v21.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sub v22.4S, v20.4S, v23.4S // .....................................................................................................e.................................................................................................................................................................................. + add v20.4S, v20.4S, v23.4S // ......................................................................................................e................................................................................................................................................................................. + add v23.4S, v17.4S, v19.4S // ..................................................................................e..................................................................................................................................................................................................... 
+ mls v21.4S, v28.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + sub v17.4S, v17.4S, v19.4S // .................................................................................e...................................................................................................................................................................................................... + sqrdmulh v19.4S, v10.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mls v8.4S, v9.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v28.4S, v27.4S, v21.4S // ............................................................................................e........................................................................................................................................................................................... + sub v27.4S, v27.4S, v21.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ mul v10.4S, v10.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sub v21.4S, v15.4S, v13.4S // ...............................................................................................................e........................................................................................................................................................................ + mls v10.4S, v19.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + add v19.4S, v23.4S, v28.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v23.4S, v23.4S, v28.4S // .........................................................................................................................e.............................................................................................................................................................. + sqrdmulh v28.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... 
+ add v9.4S, v20.4S, v19.4S // ..............................................................................................................................................e......................................................................................................................................... + sub v19.4S, v20.4S, v19.4S // .............................................................................................................................................e.......................................................................................................................................... + mul v20.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + add v27.4S, v15.4S, v13.4S // ................................................................................................................e....................................................................................................................................................................... + sqrdmulh v15.4S, v16.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mul v16.4S, v16.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. 
+ sub v13.4S, v8.4S, v10.4S // ..............................................................................................................................e......................................................................................................................................................... + mls v20.4S, v28.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + add v28.4S, v8.4S, v10.4S // ...............................................................................................................................e........................................................................................................................................................ + add v8.4S, v11.4S, v18.4S // .........................................................................................................................................e.............................................................................................................................................. + sub v18.4S, v11.4S, v18.4S // ........................................................................................................................................e............................................................................................................................................... + mul v10.4S, v24.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. 
+ mul v11.4S, v19.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v16.4S, v15.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + sqrdmulh v15.4S, v24.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v11.4S, v19.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + sqrdmulh v19.4S, v17.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mls v10.4S, v15.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + cmge v24.4S, v31.4S, v11.4S // ....................................................................................................................................................................................e................................................................................................... + mul v15.4S, v17.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + cmge v17.4S, v11.4S, v30.4S // .....................................................................................................................................................................................e.................................................................................................. + mls v15.4S, v19.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + sub v17.4S, v24.4S, v17.4S // ......................................................................................................................................................................................e................................................................................................. 
+ sqrdmulh v24.4S, v22.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mls v11.4S, v17.4S, v29.4S // .......................................................................................................................................................................................e................................................................................................ + add v19.4S, v15.4S, v20.4S // ....................................................................................................................................e................................................................................................................................................... + sub v17.4S, v15.4S, v20.4S // ...................................................................................................................................e.................................................................................................................................................... + mul v15.4S, v23.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + sqrdmulh v20.4S, v21.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... 
+ str q11, [x1, #576] // .................................................................................................................................................................................................................e...................................................................... + mul v11.4S, v22.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + mul v22.4S, v13.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mls v11.4S, v24.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + sqrdmulh v24.4S, v23.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... 
+ mul v23.4S, v21.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mls v23.4S, v20.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... + mls v22.4S, v13.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + sub v21.4S, v10.4S, v14.4S // ..........................................................................................................e............................................................................................................................................................................. + + // original source code + // ldr q8, [x1, #0] // ..e...............................................................................................................................................................................................|.......................................................................................e............................................................................ 
+ // ldr q9, [x1, #(1*(512/8))] // e.................................................................................................................................................................................................|.....................................................................................e.............................................................................. + // ldr q10, [x1, #(2*(512/8))] // .............................e....................................................................................................................................................................|..................................................................................................................e................................................. + // ldr q11, [x1, #(3*(512/8))] // ............................e.....................................................................................................................................................................|.................................................................................................................e.................................................. + // ldr q12, [x1, #(4*(512/8))] // ........................................................................e.........................................................................................................................|.............................................................................................................................................................e...... + // ldr q13, [x1, #(5*(512/8))] // ....................................................e.............................................................................................................................................|.........................................................................................................................................e.......................... 
+ // ldr q14, [x1, #(6*(512/8))] // .........................................................e........................................................................................................................................|..............................................................................................................................................e..................... + // ldr q15, [x1, #(7*(512/8))] // .............................................................e....................................................................................................................................|..................................................................................................................................................e................. + // ldr q16, [x1, #(8*(512/8))] // ................................................................e.................................................................................................................................|.....................................................................................................................................................e.............. + // ldr q17, [x1, #(9*(512/8))] // ....................................................................................e.............................................................................................................|.................................................................................................................................................................... + // ldr q18, [x1, #(10*(512/8))] // ....................................e.............................................................................................................................................................|.........................................................................................................................e.......................................... 
+ // ldr q19, [x1, #(11*(512/8))] // .........e........................................................................................................................................................................................|..............................................................................................e..................................................................... + // ldr q20, [x1, #(12*(512/8))] // .....................................................................................................................e............................................................................|.................................................................................................................................................................... + // ldr q21, [x1, #(13*(512/8))] // ...................................................................................................................e..............................................................................|.................................................................................................................................................................... + // ldr q22, [x1, #(14*(512/8))] // ............................................................................................................................e.....................................................................|.................................................................................................................................................................... + // ldr q23, [x1, #(15*(512/8))] // ..................................................................................................................e...............................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v9.4s // .....................e............................................................................................................................................................................|..........................................................................................................e......................................................... + // add v8.4s, v8.4s, v9.4s // ...........................e......................................................................................................................................................................|................................................................................................................e................................................... + // mul v9.4s, v24.4s, v3.s[2] // .............................................................................e....................................................................................................................|..................................................................................................................................................................e. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ...............................................................................e..................................................................................................................|.................................................................................................................................................................... + // mls v9.4s, v24.4s, v29.4s // ..................................................................................e...............................................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v10.4s, v11.4s // .....................................e............................................................................................................................................................|..........................................................................................................................e......................................... + // add v10.4s, v10.4s, v11.4s // ......................................e...........................................................................................................................................................|...........................................................................................................................e........................................ + // mul v11.4s, v24.4s, v4.s[0] // .................................................................................e................................................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ................................................................................e.................................................................................................................|.................................................................................................................................................................... + // mls v11.4s, v24.4s, v29.4s // .....................................................................................e............................................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v12.4s, v13.4s // ............................................................................e.....................................................................................................................|.................................................................................................................................................................e.. + // add v12.4s, v12.4s, v13.4s // ...................................................................................e..............................................................................................................|.................................................................................................................................................................... + // mul v13.4s, v24.4s, v4.s[2] // ..............................................................................................e...................................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .............................................................................................e....................................................................................................|.................................................................................................................................................................... + // mls v13.4s, v24.4s, v29.4s // ....................................................................................................e.............................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v14.4s, v15.4s // ......................................................................e...........................................................................................................................|...........................................................................................................................................................e........ + // add v14.4s, v14.4s, v15.4s // .....................................................................e............................................................................................................................|..........................................................................................................................................................e......... + // mul v15.4s, v24.4s, v5.s[0] // ...................................................................................................e..............................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // .........................................................................................e........................................................................................................|.................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // .....................................................................................................e............................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v16.4s, v17.4s // ........................................................................................e.........................................................................................................|.................................................................................................................................................................... + // add v16.4s, v16.4s, v17.4s // ......................................................................................e...........................................................................................................|.................................................................................................................................................................... + // mul v17.4s, v24.4s, v5.s[2] // ......................................................................................................e...........................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ................................................................................................e.................................................................................................|.................................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // .........................................................................................................e........................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v18.4s, v19.4s // .................................................e................................................................................................................................................|......................................................................................................................................e............................. + // add v18.4s, v18.4s, v19.4s // .......................................................e..........................................................................................................................................|............................................................................................................................................e....................... + // mul v19.4s, v24.4s, v6.s[0] // ...........................................................................e......................................................................................................................|................................................................................................................................................................e... + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ....................................................................e.............................................................................................................................|.........................................................................................................................................................e.......... + // mls v19.4s, v24.4s, v29.4s // .......................................................................................e..........................................................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v20.4s, v21.4s // ........................................................................................................................e.........................................................................|.................................................................................................................................................................... + // add v20.4s, v20.4s, v21.4s // ..............................................................................................................................e...................................................................|.................................................................................................................................................................... + // mul v21.4s, v24.4s, v6.s[2] // ...............................................................................................................................e..................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ...........................................................................................................................e......................................................................|.................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................e...............................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v22.4s, v23.4s // ................................................................................................................................e.................................................................|.................................................................................................................................................................... + // add v22.4s, v22.4s, v23.4s // .................................................................................................................................e................................................................|.................................................................................................................................................................... + // mul v23.4s, v24.4s, v7.s[0] // ..........................................................................................................................................e.......................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v7.s[1] // .........................................................................................................................................e........................................................|.................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ..............................................................................................................................................e...................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v10.4s // ..............................................................e...................................................................................................................................|...................................................................................................................................................e................ + // add v8.4s, v8.4s, v10.4s // .................................................................e................................................................................................................................|......................................................................................................................................................e............. + // mul v10.4s, v24.4s, v1.s[2] // .....................................................................................................................................................................e............................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................................................................................................................................e........................|.................................................................................................................................................................... + // mls v10.4s, v24.4s, v29.4s // ............................................................................................................................................................................e.....................|.................................................................................................................................................................... 
+ // sub v24.4s, v9.4s, v11.4s // ............................................................................................e.....................................................................................................|.................................................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...........................................................................................e......................................................................................................|.................................................................................................................................................................... + // mul v11.4s, v24.4s, v1.s[2] // ...........................................................................................................e......................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .......................................................................................................e..........................................................................................|.................................................................................................................................................................... + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................................e..........................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v12.4s, v14.4s // ..........................................................................................e.......................................................................................................|.................................................................................................................................................................... + // add v12.4s, v12.4s, v14.4s // ...............................................................................................e..................................................................................................|.................................................................................................................................................................... + // mul v14.4s, v24.4s, v2.s[0] // ................................................................................................................e.................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................e.....................................................................................|.................................................................................................................................................................... + // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................e.............................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v13.4s, v15.4s // ..........................................................................................................e.......................................................................................|.................................................................................................................................................................... + // add v13.4s, v13.4s, v15.4s // .................................................................................................................e................................................................................|.................................................................................................................................................................... + // mul v15.4s, v24.4s, v2.s[0] // ..........................................................................................................................e.......................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .........................................................................................................................e........................................................................|.................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // .............................................................................................................................e....................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v16.4s, v18.4s // ...............................................................................................................e..................................................................................|.................................................................................................................................................................... + // add v16.4s, v16.4s, v18.4s // ........................................................................................................e.........................................................................................|.................................................................................................................................................................... + // mul v18.4s, v24.4s, v2.s[2] // ......................................................................................................................................e...........................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ....................................................................................................................................e.............................................................|.................................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................e................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v17.4s, v19.4s // ...............................................................................................................................................e..................................................|.................................................................................................................................................................... + // add v17.4s, v17.4s, v19.4s // .............................................................................................................................................e....................................................|.................................................................................................................................................................... + // mul v19.4s, v24.4s, v2.s[2] // ..............................................................................................................................................................................e...................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................................................................e......................|.................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ................................................................................................................................................................................e.................|.................................................................................................................................................................... 
+ // sub v24.4s, v20.4s, v22.4s // .....................................................................................................................................e............................................................|.................................................................................................................................................................... + // add v20.4s, v20.4s, v22.4s // ...................................................................................................................................e..............................................................|.................................................................................................................................................................... + // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................................e.............................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................................e.................................................|.................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ......................................................................................................................................................e...........................................|.................................................................................................................................................................... 
+ // sub v24.4s, v21.4s, v23.4s // ...................................................................................................................................................e..............................................|.................................................................................................................................................................... + // add v21.4s, v21.4s, v23.4s // ..................................................................................................................................................e...............................................|.................................................................................................................................................................... + // mul v23.4s, v24.4s, v3.s[0] // ............................................................................................................................................................e.....................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................................................e........................................|.................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................................e................................|.................................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v12.4s // ..................................................................................................e...............................................................................................|.................................................................................................................................................................... + // add v8.4s, v8.4s, v12.4s // .................................................................................................e................................................................................................|.................................................................................................................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ..............................................................................................................e...................................................................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................e....................................................................................|.................................................................................................................................................................... + // mls v12.4s, v24.4s, v29.4s // ......................................................................................................................e...........................................................................|.................................................................................................................................................................... 
+ // sub v24.4s, v9.4s, v13.4s // ...........................................................................................................................................e......................................................|.................................................................................................................................................................... + // add v9.4s, v9.4s, v13.4s // ............................................................................................................................................e.....................................................|.................................................................................................................................................................... + // mul v13.4s, v24.4s, v0.s[2] // .........................................................................................................................................................................................e........|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................e...............|.................................................................................................................................................................... + // mls v13.4s, v24.4s, v29.4s // ............................................................................................................................................................................................e.....|.................................................................................................................................................................... 
+ // sub v24.4s, v10.4s, v14.4s // .................................................................................................................................................................................................e|.................................................................................................................................................................... + // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................................................................................*.................................................................................................................................................................... + // mul v14.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................................................|.........*.......................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................................................................................|...........*........................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.................*.................................................................................................................................................. 
+ // sub v24.4s, v11.4s, v15.4s // .....................................................................................................................................................e............................................|.................................................................................................................................................................... + // add v11.4s, v11.4s, v15.4s // .............................................................................................................................................................e....................................|.................................................................................................................................................................... + // mul v15.4s, v24.4s, v0.s[2] // ..............................................................................................................................................................................................e...|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................................e..........|.................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ...............................................................................................................................................................................................e..|.................................................................................................................................................................... 
+ // sub v24.4s, v16.4s, v20.4s // .......................................................................................................................................e..........................................................|.................................................................................................................................................................... + // add v16.4s, v16.4s, v20.4s // ........................................................................................................................................e.........................................................|.................................................................................................................................................................... + // mul v20.4s, v24.4s, v1.s[0] // ...............................................................................................................................................................e..................................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................e...................................|.................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ........................................................................................................................................................................e.........................|.................................................................................................................................................................... 
+ // sub v24.4s, v17.4s, v21.4s // ........................................................................................................................................................e.........................................|.................................................................................................................................................................... + // add v17.4s, v17.4s, v21.4s // .......................................................................................................................................................e..........................................|.................................................................................................................................................................... + // mul v21.4s, v24.4s, v1.s[0] // ......................................................................................................................................................................................e...........|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................................................................................................................................................e....|.................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|*................................................................................................................................................................... 
+ // sub v24.4s, v18.4s, v22.4s // ................................................................................................................................................................e.................................|.................................................................................................................................................................... + // add v18.4s, v18.4s, v22.4s // ..................................................................................................................................................................e...............................|.................................................................................................................................................................... + // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................................................e.......|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................................................................................e......|.................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ................................................................................................................................................................................................e.|.................................................................................................................................................................... 
+ // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e............|.................................................................................................................................................................... + // add v19.4s, v19.4s, v23.4s // ....................................................................................................................................................................................e.............|.................................................................................................................................................................... + // mul v23.4s, v24.4s, v1.s[0] // ..................................................................................................................................................................................................|......*............................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................................................................................................................................................................|...*................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................*................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v16.4s // ....................................................................................................................................................................e.............................|.................................................................................................................................................................... + // add v8.4s, v8.4s, v16.4s // ...................................................................................................................................................................e..............................|.................................................................................................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|................*................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|...............*.................................................................................................................................................... + // mls v16.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...................*................................................................................................................................................ 
+ // sub v24.4s, v9.4s, v17.4s // ...........................................................................................................................................................e......................................|.................................................................................................................................................................... + // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................e.......................................|.................................................................................................................................................................... + // mul v17.4s, v24.4s, v0.s[0] // ......................................................................................................................................................................e...........................|.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................e..........................|.................................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.......................|.................................................................................................................................................................... 
+ // sub v24.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|....*............................................................................................................................................................... + // add v10.4s, v10.4s, v18.4s // ..................................................................................................................................................................................................|.....*.............................................................................................................................................................. + // mul v18.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...................................................................*................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|..............................................................*..................................................................................................... + // mls v18.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.....................................................................*.............................................................................................. 
+ // sub v24.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|............*....................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................................................|..........*......................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...............................................*.................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|............................*....................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|..................................................*................................................................................................................. 
+ // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|.*.................................................................................................................................................................. + // add v12.4s, v12.4s, v20.4s // ..................................................................................................................................................................................................|..*................................................................................................................................................................. + // mul v20.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|...............................*.................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|.................................*.................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|.............................................*...................................................................................................................... 
+ // sub v24.4s, v13.4s, v21.4s // ..................................................................................................................................................................................................|........*........................................................................................................................................................... + // add v13.4s, v13.4s, v21.4s // ..................................................................................................................................................................................................|.......*............................................................................................................................................................ + // mul v21.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|....................................*............................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|......................................*............................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...........................................*........................................................................................................................ 
+ // sub v24.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|....................*............................................................................................................................................... + // add v14.4s, v14.4s, v22.4s // ..................................................................................................................................................................................................|.....................*.............................................................................................................................................. + // mul v22.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................|.......................................*............................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................................................................|........................................*........................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................|...........................................................*........................................................................................................ 
+ // sub v24.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|...........................................................................*........................................................................................ + // add v15.4s, v15.4s, v23.4s // ..................................................................................................................................................................................................|..............................................................................*..................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ..................................................*...............................................................................................................................................|.......................................................................................................................................*............................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................*......................................................................................................................................................|................................................................................................................................*................................... + // mls v23.4s, v24.4s, v29.4s // .....................................................*............................................................................................................................................|..........................................................................................................................................*......................... 
+ // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|........................*........................................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|..........................*......................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|...........................*........................................................................................................................................ + // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|..............................*..................................................................................................................................... + // cmge v27.4s, v31.4s, v17.4s // .............................................................................................................................................................................e....................|.................................................................................................................................................................... 
+ // cmge v28.4s, v17.4s, v30.4s // ...............................................................................................................................................................................e..................|.................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................e................|.................................................................................................................................................................... + // mls v17.4s, v28.4s, v29.4s // ...................................................................................................................................................................................e..............|.................................................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ...*..............................................................................................................................................................................................|........................................................................................*........................................................................... + // cmge v28.4s, v18.4s, v30.4s // .*................................................................................................................................................................................................|......................................................................................*............................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // ............*.....................................................................................................................................................................................|.................................................................................................*.................................................................. + // mls v18.4s, v28.4s, v29.4s // .................................*................................................................................................................................................................|......................................................................................................................*............................................. + // cmge v27.4s, v31.4s, v19.4s // ..................................................................................................................................................................................................|..................................................................*................................................................................................. + // cmge v28.4s, v19.4s, v30.4s // ..................................................................................................................................................................................................|......................................................................*............................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|........................................................................*........................................................................................... 
+ // mls v19.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.............................................................................*...................................................................................... + // cmge v27.4s, v31.4s, v20.4s // ..................................................................................................................................................................................................|......................................................*............................................................................................................. + // cmge v28.4s, v20.4s, v30.4s // ..................................................................................................................................................................................................|....................................................*............................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.......................................................*............................................................................................................ + // mls v20.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|....................................................................*............................................................................................... 
+ // cmge v27.4s, v31.4s, v21.4s // ..................................................................................................................................................................................................|...................................................*................................................................................................................ + // cmge v28.4s, v21.4s, v30.4s // ..................................................................................................................................................................................................|.................................................*.................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|..........................................................*......................................................................................................... + // mls v21.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.............................................................*...................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ..................................................................................................................................................................................................|............................................................................*....................................................................................... 
+ // cmge v28.4s, v22.4s, v30.4s // ..................................................................................................................................................................................................|.................................................................................*.................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|....................................................................................*............................................................................... + // mls v22.4s, v28.4s, v29.4s // ..........*.......................................................................................................................................................................................|...............................................................................................*.................................................................... + // cmge v27.4s, v31.4s, v23.4s // ..........................................................*.......................................................................................................................................|...............................................................................................................................................*.................... + // cmge v28.4s, v23.4s, v30.4s // ............................................................*.....................................................................................................................................|.................................................................................................................................................*.................. 
+ // sub v28.4s, v27.4s, v28.4s // ...................................................................*..............................................................................................................................|........................................................................................................................................................*........... + // mls v23.4s, v28.4s, v29.4s // .........................................................................*........................................................................................................................|..............................................................................................................................................................*..... + // str q16, [x1, #(8*(512/8))] // ..................................................................................................................................................................................................|...................................*................................................................................................................................ + // str q17, [x1, #(9*(512/8))] // ........................................................................................................................................................................................e.........|.................................................................................................................................................................... + // str q18, [x1, #(10*(512/8))] // .........................................*........................................................................................................................................................|..............................................................................................................................*..................................... 
+ // str q19, [x1, #(11*(512/8))] // ..................................................................................................................................................................................................|..................................................................................*................................................................................. + // str q20, [x1, #(12*(512/8))] // ..................................................................................................................................................................................................|.........................................................................*.......................................................................................... + // str q21, [x1, #(13*(512/8))] // ..................................................................................................................................................................................................|................................................................*................................................................................................... + // str q22, [x1, #(14*(512/8))] // ..................*...............................................................................................................................................................................|.......................................................................................................*............................................................ 
+ // str q23, [x1, #(15*(512/8))] // ..............................................................................*...................................................................................................................|...................................................................................................................................................................* + // mul v16.4s, v8.4s, v25.4s // ..................................................................................................................................................................................................|.............*...................................................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ..................................................................................................................................................................................................|..............*..................................................................................................................................................... + // mls v16.4s, v8.4s, v29.4s // ..................................................................................................................................................................................................|.........................*.......................................................................................................................................... + // mul v17.4s, v9.4s, v25.4s // ..................................................................................................................................................................................................|................................................................................*................................................................................... 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // ..................................................................................................................................................................................................|...............................................................................*.................................................................................... + // mls v17.4s, v9.4s, v29.4s // ..................................................................................................................................................................................................|...................................................................................*................................................................................ + // mul v18.4s, v10.4s, v25.4s // ..................................................................................................................................................................................................|.......................*............................................................................................................................................ + // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................................................................................................................................|......................*............................................................................................................................................. + // mls v18.4s, v10.4s, v29.4s // ..................................................................................................................................................................................................|.............................*...................................................................................................................................... 
+ // mul v19.4s, v11.4s, v25.4s // ..................................................................................................................................................................................................|............................................................*....................................................................................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ..................................................................................................................................................................................................|.........................................................*.......................................................................................................... + // mls v19.4s, v11.4s, v29.4s // ..................................................................................................................................................................................................|.......................................................................*............................................................................................ + // mul v20.4s, v12.4s, v25.4s // ................................*.................................................................................................................................................................|.....................................................................................................................*.............................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ..........................*.......................................................................................................................................................................|...............................................................................................................*.................................................... 
+ // mls v20.4s, v12.4s, v29.4s // ...................................*..............................................................................................................................................................|........................................................................................................................*........................................... + // mul v21.4s, v13.4s, v25.4s // ....*.............................................................................................................................................................................................|.........................................................................................*.......................................................................... + // sqrdmulh v13.4s, v13.4s, v26.4s // ......*...........................................................................................................................................................................................|...........................................................................................*........................................................................ + // mls v21.4s, v13.4s, v29.4s // ................*.................................................................................................................................................................................|.....................................................................................................*.............................................................. + // mul v22.4s, v14.4s, v25.4s // ..................................................................................................................................................................................................|...............................................................*.................................................................................................... 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // ..................................................................................................................................................................................................|.................................................................*.................................................................................................. + // mls v22.4s, v14.4s, v29.4s // ..................................................................................................................................................................................................|..........................................................................*......................................................................................... + // mul v23.4s, v15.4s, v25.4s // ......................*...........................................................................................................................................................................|...........................................................................................................*........................................................ + // sqrdmulh v15.4s, v15.4s, v26.4s // .............*....................................................................................................................................................................................|..................................................................................................*................................................................. + // mls v23.4s, v15.4s, v29.4s // ........................*.........................................................................................................................................................................|.............................................................................................................*...................................................... 
+ // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................|..........................................*......................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ..................................................................................................................................................................................................|............................................*....................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|................................................*................................................................................................................... + // mls v16.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.....................................................*.............................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ........*.........................................................................................................................................................................................|.............................................................................................*...................................................................... 
+ // cmge v28.4s, v17.4s, v30.4s // ...........*......................................................................................................................................................................................|................................................................................................*................................................................... + // sub v28.4s, v27.4s, v28.4s // ...............*..................................................................................................................................................................................|....................................................................................................*............................................................... + // mls v17.4s, v28.4s, v29.4s // ....................*.............................................................................................................................................................................|.........................................................................................................*.......................................................... + // cmge v27.4s, v31.4s, v18.4s // ..................................................................................................................................................................................................|................................*................................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................................................................................................................................................................................|..................................*................................................................................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // ..................................................................................................................................................................................................|.....................................*.............................................................................................................................. + // mls v18.4s, v28.4s, v29.4s // ..................................................................................................................................................................................................|.........................................*.......................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // .....*............................................................................................................................................................................................|..........................................................................................*......................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......*..........................................................................................................................................................................................|............................................................................................*....................................................................... + // sub v28.4s, v27.4s, v28.4s // ..............*...................................................................................................................................................................................|...................................................................................................*................................................................ 
+ // mls v19.4s, v28.4s, v29.4s // .............................................*....................................................................................................................................................|..................................................................................................................................*................................. + // cmge v27.4s, v31.4s, v20.4s // ............................................*.....................................................................................................................................................|.................................................................................................................................*.................................. + // cmge v28.4s, v20.4s, v30.4s // ..............................................*...................................................................................................................................................|...................................................................................................................................*................................ + // sub v28.4s, v27.4s, v28.4s // ...................................................*..............................................................................................................................................|........................................................................................................................................*........................... + // mls v20.4s, v28.4s, v29.4s // ......................................................*...........................................................................................................................................|...........................................................................................................................................*........................ 
+ // cmge v27.4s, v31.4s, v21.4s // ..................................*...............................................................................................................................................................|.......................................................................................................................*............................................ + // cmge v28.4s, v21.4s, v30.4s // .........................*........................................................................................................................................................................|..............................................................................................................*..................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................*..........................................................................................................................................................|............................................................................................................................*....................................... + // mls v21.4s, v28.4s, v29.4s // ...............................................................*..................................................................................................................................|....................................................................................................................................................*............... + // cmge v27.4s, v31.4s, v22.4s // ...................*..............................................................................................................................................................................|........................................................................................................*........................................................... 
+ // cmge v28.4s, v22.4s, v30.4s // .................*................................................................................................................................................................................|......................................................................................................*............................................................. + // sub v28.4s, v27.4s, v28.4s // .......................*..........................................................................................................................................................................|............................................................................................................*....................................................... + // mls v22.4s, v28.4s, v29.4s // ........................................*.........................................................................................................................................................|.............................................................................................................................*...................................... + // cmge v27.4s, v31.4s, v23.4s // ...............................*..................................................................................................................................................................|....................................................................................................................*............................................... + // cmge v28.4s, v23.4s, v30.4s // ..........................................*.......................................................................................................................................................|...............................................................................................................................*.................................... 
+ // sub v28.4s, v27.4s, v28.4s // ................................................*.................................................................................................................................................|.....................................................................................................................................*.............................. + // mls v23.4s, v28.4s, v29.4s // ...........................................................*......................................................................................................................................|................................................................................................................................................*................... + // str q16, [x1], #(16) // ..................................................................................................................................................................................................|........................................................*........................................................................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // ..............................*...................................................................................................................................................................|...................................................................................................................*................................................ + // str q18, [x1, #(-16 + 2*(512/8))] // ..................................................................................................................................................................................................|..............................................*..................................................................................................................... 
+ // str q19, [x1, #(-16 + 3*(512/8))] // ........................................................*.........................................................................................................................................|.............................................................................................................................................*...................... + // str q20, [x1, #(-16 + 4*(512/8))] // ..................................................................*...............................................................................................................................|.......................................................................................................................................................*............ + // str q21, [x1, #(-16 + 5*(512/8))] // ..........................................................................*.......................................................................................................................|...............................................................................................................................................................*.... + // str q22, [x1, #(-16 + 6*(512/8))] // ...............................................*..................................................................................................................................................|....................................................................................................................................*............................... 
+ // str q23, [x1, #(-16 + 7*(512/8))] // .......................................................................*..........................................................................................................................|............................................................................................................................................................*....... + + sub count, count, #1 + cbnz count, layer1234_start + mls v15.4S, v24.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + add v24.4S, v10.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v20.4S, v27.4S, v19.4S // .......................................................................................................................................................*................................................................................................................................ + sub v10.4S, v12.4S, v16.4S // ............................................................................................................................................................*........................................................................................................................... 
+ sqrdmulh v14.4S, v18.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + add v12.4S, v12.4S, v16.4S // .............................................................................................................................................................*.......................................................................................................................... + mul v16.4S, v18.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v18.4S, v11.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v11.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + sqrdmulh v15.4S, v21.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ add v11.4S, v27.4S, v19.4S // ........................................................................................................................................................*............................................................................................................................... + mul v19.4S, v21.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + mul v27.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + mls v19.4S, v15.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mls v16.4S, v14.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + sqrdmulh v15.4S, v10.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
+ sub v21.4S, v19.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v19.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v22.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + cmge v19.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + mul v20.4S, v10.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + cmge v10.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... 
+ mls v20.4S, v15.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v15.4S, v24.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... + sub v10.4S, v19.4S, v10.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v19.4S, v17.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + add v10.4S, v24.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... 
+ str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sqrdmulh v24.4S, v17.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + cmge v16.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + mls v27.4S, v22.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sub v22.4S, v17.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... 
+ sqrdmulh v17.4S, v18.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v28.4S, v18.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + cmge v18.4S, v31.4S, v27.4S // ............................................................................................................................................................................................*........................................................................................... + mls v20.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v22.4S, v27.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v19.4S, v24.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
+ sub v22.4S, v18.4S, v22.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v24.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + mul v20.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v15.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ sqrdmulh v8.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mul v18.4S, v21.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mls v20.4S, v24.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mls v27.4S, v22.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
+ mls v16.4S, v15.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + add v15.4S, v23.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + cmge v22.4S, v20.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v28.4S, v17.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + str q27, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v27.4S, v23.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ mls v18.4S, v21.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v24.4S, v31.4S, v20.4S // ........................................................................................................................................................................................*............................................................................................... + mls v10.4S, v8.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + cmge v8.4S, v28.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v23.4S, v31.4S, v28.4S // ....................................................................................................................................................................................................*................................................................................... + sub v22.4S, v24.4S, v22.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + cmge v19.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + sub v8.4S, v23.4S, v8.4S // ......................................................................................................................................................................................................*................................................................................. + mls v20.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ cmge v22.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + sub v24.4S, v24.4S, v19.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v28.4S, v8.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v23.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v16.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ cmge v24.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + str q20, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sub v20.4S, v8.4S, v22.4S // ..........................................................................................................................................................................................................*............................................................................. + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + str q28, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + sub v8.4S, v24.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ sqrdmulh v22.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sqrdmulh v16.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v10.4S, v8.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sqrdmulh v23.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ cmge v8.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + cmge v14.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sqrdmulh v28.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ 
+ mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sqrdmulh v9.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v24.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + str q18, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. 
+ cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sqrdmulh v12.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mul v15.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v24.4S, v24.4S, v18.4S // ......................................................................................................................................................................................................................................................*................................. + mul v27.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + mls v27.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ 
+ mls v20.4S, v9.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v9.4S, v14.4S, v8.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v21.4S, v28.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v14.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... + mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + cmge v10.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ cmge v28.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + cmge v17.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v15.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v16.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + cmge v12.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
+ sub v28.4S, v17.4S, v28.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v22.4S, v23.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v10.4S, v14.4S, v10.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v19.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + sub v14.4S, v16.4S, v12.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v17.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... 
+ mls v20.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v16.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................................*........... + cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v21.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + cmge v14.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... 
+ sub v19.4S, v16.4S, v17.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v27.4S, v10.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + sub v14.4S, v14.4S, v28.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v15.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ mls v22.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + str q27, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + str q15, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
+ + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s new file mode 100644 index 00000000..c23659d1 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1,1934 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] // load the qform_ (q-register) alias of vec from base + offset +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc // load from base, then post-increment base by inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] // store the qform_ (q-register) alias of vec to base + offset +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc // store to base, then post-increment base by inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] // by-element form: all four lanes of a use lane i of b +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] // by-element form: all four lanes of a use lane i of b +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] // by-element form: all four lanes of a use lane i of b +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 // dst = low 32 bits of src * const.s[idx0] + vqrdmulhq \src, \src, \const, \idx1 // src is overwritten with sqrdmulh(src, const.s[idx1]) + vmls \dst, \src, modulus // dst -= src * modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const // dst = low 32 bits of src * const + vqrdmulh \src, \src, \const_twisted // src is overwritten with sqrdmulh(src, const_twisted) + vmls \dst, \src, modulus // dst -= src * modulus +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 // tmp = a >> 23, signed with rounding + vmls \a, tmp, modulus // a -= tmp * modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 lane = all-ones (-1) where a <= neg_modulus_half + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 lane = all-ones (-1) where a >= modulus_half + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 = tmp1 - tmp2: -1, +1 or 0 per lane + vmls \a, \tmp2, modulus // a -= tmp2 * modulus, i.e. add or subtract modulus at most once +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + 
vsub tmp, \a, \b // tmp = a - b + vadd \a, \a, \b // a = a + b + mulmodq \b, tmp, \root, \idx0, \idx1 // b = tmp * root.s[idx0] - sqrdmulh(tmp, root.s[idx1]) * modulus +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const // body is instruction-for-instruction identical to mulmod + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b // tmp = a - b + vadd \a, \a, \b // a = a + b + mulmod \b, tmp, \root, \root_twisted // b = tmp * root - sqrdmulh(tmp, root_twisted) * modulus +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw // NOTE(review): no .req alias for ninv / ninv_tw is visible in this part of the file - confirm before use + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) // load root0 and advance r_ptr by 8 vectors; the loads below reach back with negative offsets + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 // 16-byte load, but the pointer only advances by 8 bytes +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) // load root0 and advance r_ptr1 by 6 vectors; the loads below reach back with negative offsets + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 // 4x4 transpose of the 32-bit lanes of data0..data3; t0-t3 are clobbered + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + // Spill x19-x29 into the 16*6 bytes reserved above, one store per slot; restore_gprs reloads the same slots. + stp x19, 
x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the 96 bytes reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // reserve 64 bytes for d8-d15 + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] // reload a from the stack slot at sp + loc +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] // store a to the stack slot at sp + loc +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // reserve STACK_SIZE bytes of scratch below the saved registers +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm + .global _intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm + +.p2align 4 +modulus_addr: .quad 8380417 // 2^23 - 2^13 + 1; the function broadcasts this into the modulus register +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: +_intt_dilithium_1234_5678_manual_ld4_opt_m1_firestorm: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req 
q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q11, [x0, #0] // ..*..................................... + ldr q9, [x0, #48] // ...*.................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q17, [x0, #16] // .*...................................... + ldr q2, [x0, #32] // ....*................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q1, [x3, #32] // *....................................... + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v28.4S, v11.4S, v17.4S // .......*................................ + trn1 v12.4S, v2.4S, v9.4S // .........*.............................. + trn2 v8.4S, v11.4S, v17.4S // ........*............................... + trn2 v23.4S, v2.4S, v9.4S // ..........*............................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + trn2 v17.2D, v8.2D, v23.2D // ..............*......................... + trn2 v11.2D, v28.2D, v12.2D // ...............*........................ + trn1 v15.2D, v28.2D, v12.2D // ...........*............................ + ldr q0, [x3, #80] // .....*.................................. + trn1 v20.2D, v8.2D, v23.2D // ............*........................... + ldr q16, [x3, #64] // ......*................................. + ldr q25, [x3, #48] // .............*.......................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v10.4S, v11.4S, v17.4S // ...................*.................... + sub v18.4S, v15.4S, v20.4S // ................*....................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v28.4S, v11.4S, v17.4S // .........................*.............. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ mul v24.4S, v18.4S, v1.4S // .....................*.................. + sqrdmulh v0.4S, v10.4S, v0.4S // .......................*................ + mul v30.4S, v10.4S, v16.4S // ......................*................. + sqrdmulh v17.4S, v18.4S, v25.4S // ....................*................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q31, [x3], #(6*16) // ..................*..................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v7.4S, v15.4S, v20.4S // .................*...................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v30.4S, v0.4S, v29.4S // ...........................*............ + mls v24.4S, v17.4S, v29.4S // ........................*............... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + ldr q15, [x3, #-80] // ..........................*............. + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v6.4S, v7.4S, v28.4S // ............................*........... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v27.4S, v24.4S, v30.4S // ...............................*........ + add v4.4S, v24.4S, v30.4S // ................................*....... + add v24.4S, v7.4S, v28.4S // ..............................*......... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v10.4S, v6.4S, v31.4S // .................................*...... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v0.4S, v27.4S, v31.4S // ..................................*..... + sqrdmulh v26.4S, v6.4S, v15.4S // .............................*.......... + sqrdmulh v16.4S, v27.4S, v15.4S // ...................................*.... + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn2 v5.4S, v24.4S, v4.4S // .....................................*.. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + trn1 v4.4S, v24.4S, v4.4S // ....................................*... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v10.4S, v26.4S, v29.4S // ......................................*. + mls v0.4S, v16.4S, v29.4S // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + + // original source code + // ldr q21, [x3, #32] // ....*................................... + // ldr q20, [x0, #16] // ..*..................................... + // ldr q19, [x0, #0] // *....................................... + // ldr q7, [x0, #48] // .*...................................... + // ldr q1, [x0, #32] // ...*.................................... + // ldr q0, [x3, #80] // ............*........................... 
+ // ldr q16, [x3, #64] // ..............*......................... + // trn1 v30.4S, v19.4S, v20.4S // .....*.................................. + // trn2 v25.4S, v19.4S, v20.4S // .......*................................ + // trn1 v3.4S, v1.4S, v7.4S // ......*................................. + // trn2 v2.4S, v1.4S, v7.4S // ........*............................... + // trn1 v26.2D, v30.2D, v3.2D // ...........*............................ + // trn1 v23.2D, v25.2D, v2.2D // .............*.......................... + // ldr q6, [x3, #48] // ...............*........................ + // trn2 v1.2D, v25.2D, v2.2D // .........*.............................. + // trn2 v24.2D, v30.2D, v3.2D // ..........*............................. + // sub v18.4S, v26.4S, v23.4S // .................*...................... + // add v25.4S, v26.4S, v23.4S // ........................*............... + // ldr q20, [x3], #(6*16) // .......................*................ + // sub v4.4S, v24.4S, v1.4S // ................*....................... + // sqrdmulh v27.4S, v18.4S, v6.4S // ......................*................. + // mul v3.4S, v18.4S, v21.4S // ...................*.................... + // mul v14.4S, v4.4S, v16.4S // .....................*.................. + // sqrdmulh v16.4S, v4.4S, v0.4S // ....................*................... + // mls v3.4S, v27.4S, v29.4S // ..........................*............. + // add v27.4S, v24.4S, v1.4S // ..................*..................... + // ldr q1, [x3, #-80] // ...........................*............ + // mls v14.4S, v16.4S, v29.4S // .........................*.............. + // sub v9.4S, v25.4S, v27.4S // ............................*........... + // sqrdmulh v16.4S, v9.4S, v1.4S // ..................................*..... + // add v30.4S, v25.4S, v27.4S // ...............................*........ + // sub v22.4S, v3.4S, v14.4S // .............................*.......... 
+ // add v13.4S, v3.4S, v14.4S // ..............................*......... + // mul v10.4S, v9.4S, v20.4S // ................................*....... + // mul v0.4S, v22.4S, v20.4S // .................................*...... + // sqrdmulh v12.4S, v22.4S, v1.4S // ...................................*.... + // trn1 v4.4S, v30.4S, v13.4S // .....................................*.. + // trn2 v5.4S, v30.4S, v13.4S // ....................................*... + // mls v10.4S, v16.4S, v29.4S // ......................................*. + // mls v0.4S, v12.4S, v29.4S // .......................................* + + sub count, count, #1 +layer5678_start: + ldr q21, [x3, #32] // ..............e............................................................. + ldr q20, [x0, #80] // .e.......................................................................... + ldr q19, [x0, #64] // e........................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q7, [x0, #112] // ...e........................................................................ + ldr q1, [x0, #96] // ..e......................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + trn2 v28.4S, v10.4S, v0.4S // .........................................*.................................. + trn1 v23.4S, v10.4S, v0.4S // ........................................*................................... + ldr q0, [x3, #80] // .................e.......................................................... + ldr q16, [x3, #64] // ................e........................................................... + ldr q22, [x4], #8 // ..............................................*............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v30.4S, v19.4S, v20.4S // ....e....................................................................... + trn2 v25.4S, v19.4S, v20.4S // .....e...................................................................... + ldr q20, [x4], #16 // ...............................................*............................ 
+ trn2 v10.2D, v5.2D, v28.2D // ...........................................*................................ + trn2 v8.2D, v4.2D, v23.2D // ..........................................*................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v3.4S, v1.4S, v7.4S // ......e..................................................................... + trn2 v2.4S, v1.4S, v7.4S // .......e.................................................................... + trn1 v7.2D, v4.2D, v23.2D // ............................................*............................... + trn1 v6.2D, v5.2D, v28.2D // .............................................*.............................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v17.4S, v8.4S, v10.4S // ......................................................*..................... + sub v18.4S, v8.4S, v10.4S // .....................................................*...................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + sub v28.4S, v7.4S, v6.4S // ................................................*........................... + add v5.4S, v7.4S, v6.4S // .................................................*.......................... + trn1 v26.2D, v30.2D, v3.2D // ..........e................................................................. + trn1 v23.2D, v25.2D, v2.2D // ...........e................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + ldr q6, [x3, #48] // ...............e............................................................ + trn2 v1.2D, v25.2D, v2.2D // .........e.................................................................. + mul v31.4S, v18.4S, v20.S[2] // .......................................................*.................... + sqrdmulh v7.4S, v18.4S, v20.S[3] // ........................................................*................... + trn2 v24.2D, v30.2D, v3.2D // ........e................................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v18.4S, v26.4S, v23.4S // ..................e......................................................... + add v25.4S, v26.4S, v23.4S // ...................e........................................................ 
+ mul v2.4S, v28.4S, v20.S[0] // ..................................................*......................... + sqrdmulh v23.4S, v28.4S, v20.S[1] // ...................................................*........................ + ldr q20, [x3], #(6*16) // ............e............................................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v4.4S, v24.4S, v1.4S // .......................e.................................................... + sub v8.4S, v5.4S, v17.4S // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + add v10.4S, v5.4S, v17.4S // ...........................................................*................ + mls v31.4S, v7.4S, v29.4S // .........................................................*.................. + sqrdmulh v27.4S, v18.4S, v6.4S // .....................e...................................................... + mul v3.4S, v18.4S, v21.4S // ....................e....................................................... + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + // gap // ............................................................................ + mul v14.4S, v4.4S, v16.4S // .........................e.................................................. + sqrdmulh v16.4S, v4.4S, v0.4S // ..........................e................................................. + mls v2.4S, v23.4S, v29.4S // ....................................................*....................... + mul v17.4S, v8.4S, v22.S[0] // ............................................................*............... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v12.4S, v8.4S, v22.S[1] // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + srshr v30.4S, v10.4S, #23 // ....................................................................*....... + mls v3.4S, v27.4S, v29.4S // ......................e..................................................... + add v27.4S, v24.4S, v1.4S // ........................e................................................... 
+ ldr q1, [x3, #-80] // .............e.............................................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v4.4S, v2.4S, v31.4S // ...............................................................*............ + add v6.4S, v2.4S, v31.4S // ................................................................*........... + mls v14.4S, v16.4S, v29.4S // ...........................e................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v17.4S, v12.4S, v29.4S // ..............................................................*............. + sub v9.4S, v25.4S, v27.4S // ............................e............................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ mls v10.4S, v30.4S, v29.4S // .....................................................................*...... + mul v19.4S, v4.4S, v22.S[0] // .................................................................*.......... + sqrdmulh v4.4S, v4.4S, v22.S[1] // ..................................................................*......... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + srshr v7.4S, v6.4S, #23 // ......................................................................*..... + sqrdmulh v16.4S, v9.4S, v1.4S // ...............................e............................................ + add v30.4S, v25.4S, v27.4S // .............................e.............................................. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sub v22.4S, v3.4S, v14.4S // .................................e.......................................... + add v13.4S, v3.4S, v14.4S // ..................................e......................................... + str q10, [x0], #(16*4) // ........................................................................*... + mul v10.4S, v9.4S, v20.4S // ..............................e............................................. 
+ str q17, [x0, #-32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mul v0.4S, v22.4S, v20.4S // ...................................e........................................ + mls v6.4S, v7.4S, v29.4S // .......................................................................*.... + sqrdmulh v12.4S, v22.4S, v1.4S // ....................................e....................................... + mls v19.4S, v4.4S, v29.4S // ...................................................................*........ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v4.4S, v30.4S, v13.4S // ......................................e..................................... + trn2 v5.4S, v30.4S, v13.4S // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v10.4S, v16.4S, v29.4S // ................................e........................................... + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + mls v0.4S, v12.4S, v29.4S // .....................................e...................................... + str q6, [x0, #-48] // .........................................................................*.. + str q19, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + + // original source code + // ldr q8, [x0, #(16*0)] // ..e.........................................................................|.e......................................................................... + // ldr q9, [x0, #(16*1)] // .e..........................................................................|e.......................................................................... 
+ // ldr q10, [x0, #(16*2)] // ....e.......................................................................|...e....................................................................... + // ldr q11, [x0, #(16*3)] // ...e........................................................................|..e........................................................................ + // trn1 v25.4s, v8.4s, v9.4s // ..........e.................................................................|.........e................................................................. + // trn2 v26.4s, v8.4s, v9.4s // ...........e................................................................|..........e................................................................ + // trn1 v27.4s, v10.4s, v11.4s // ...............e............................................................|..............e............................................................ + // trn2 v28.4s, v10.4s, v11.4s // ................e...........................................................|...............e........................................................... + // trn2 v10.2d, v25.2d, v27.2d // .............................e..............................................|............................e.............................................. + // trn2 v11.2d, v26.2d, v28.2d // ..........................e.................................................|.........................e................................................. + // trn1 v8.2d, v25.2d, v27.2d // .......................e....................................................|......................e.................................................... + // trn1 v9.2d, v26.2d, v28.2d // ........................e...................................................|.......................e................................................... 
+ // ldr q0, [x3], #(6*16) // ..................................e.........................................|.................................e......................................... + // ldr q4, [x3, #(-6*16 + 1*16)] // .................................................e..........................|................................................e.......................... + // ldr q1, [x3, #(-6*16 + 2*16)] // e...........................................................................e........................................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // .........................e..................................................|........................e.................................................. + // ldr q2, [x3, #(-6*16 + 4*16)] // ........e...................................................................|.......e................................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // .......e....................................................................|......e.................................................................... + // sub v24.4s, v8.4s, v9.4s // ..............................e.............................................|.............................e............................................. + // add v8.4s, v8.4s, v9.4s // ...............................e............................................|..............................e............................................ + // mul v9.4s, v24.4s, v1.4s // ........................................e...................................|.......................................e................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................e....................................|......................................e.................................... 
+ // mls v9.4s, v24.4s, v29.4s // ...............................................e............................|..............................................e............................ + // sub v24.4s, v10.4s, v11.4s // ...................................e........................................|..................................e........................................ + // add v10.4s, v10.4s, v11.4s // ................................................e...........................|...............................................e........................... + // mul v11.4s, v24.4s, v2.4s // .........................................e..................................|........................................e.................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ..........................................e.................................|.........................................e................................. + // mls v11.4s, v24.4s, v29.4s // ....................................................e.......................|...................................................e....................... + // sub v24.4s, v8.4s, v10.4s // ......................................................e.....................|.....................................................e..................... + // add v8.4s, v8.4s, v10.4s // ............................................................e...............|...........................................................e............... + // mul v10.4s, v24.4s, v0.4s // ................................................................e...........|...............................................................e........... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...........................................................e................|..........................................................e................ 
+ // mls v10.4s, v24.4s, v29.4s // ........................................................................e...|.......................................................................e... + // sub v24.4s, v9.4s, v11.4s // .............................................................e..............|............................................................e.............. + // add v9.4s, v9.4s, v11.4s // ..............................................................e.............|.............................................................e............. + // mul v11.4s, v24.4s, v0.4s // ..................................................................e.........|.................................................................e......... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................e.......|...................................................................e....... + // mls v11.4s, v24.4s, v29.4s // .........................................................................e..|........................................................................e.. + // trn1 v25.4s, v8.4s, v9.4s // ......................................................................e.....|.....................................................................e..... + // trn2 v26.4s, v8.4s, v9.4s // .......................................................................e....|......................................................................e.... + // trn1 v27.4s, v10.4s, v11.4s // ......*.....................................................................|.....*..................................................................... + // trn2 v28.4s, v10.4s, v11.4s // .....*......................................................................|....*...................................................................... 
+ // trn2 v10.2d, v25.2d, v27.2d // ..............*.............................................................|.............*............................................................. + // trn2 v11.2d, v26.2d, v28.2d // .............*..............................................................|............*.............................................................. + // trn1 v8.2d, v25.2d, v27.2d // .................*..........................................................|................*.......................................................... + // trn1 v9.2d, v26.2d, v28.2d // ..................*.........................................................|.................*......................................................... + // ldr q1, [x4], #8 // .........*..................................................................|........*.................................................................. + // ldr q0, [x4], #16 // ............*...............................................................|...........*............................................................... + // sub v24.4s, v8.4s, v9.4s // .....................*......................................................|....................*...................................................... + // add v8.4s, v8.4s, v9.4s // ......................*.....................................................|.....................*..................................................... + // mul v9.4s, v24.4s, v0.s[0] // ................................*...........................................|...............................*........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................*..........................................|................................*.......................................... 
+ // mls v9.4s, v24.4s, v29.4s // ...........................................*................................|..........................................*................................ + // sub v24.4s, v10.4s, v11.4s // ....................*.......................................................|...................*....................................................... + // add v10.4s, v10.4s, v11.4s // ...................*........................................................|..................*........................................................ + // mul v11.4s, v24.4s, v0.s[2] // ...........................*................................................|..........................*................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................*...............................................|...........................*............................................... + // mls v11.4s, v24.4s, v29.4s // ......................................*.....................................|.....................................*..................................... + // sub v24.4s, v8.4s, v10.4s // ....................................*.......................................|...................................*....................................... + // add v8.4s, v8.4s, v10.4s // .....................................*......................................|....................................*...................................... + // mul v10.4s, v24.4s, v1.s[0] // ............................................*...............................|...........................................*............................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................*..............................|............................................*.............................. 
+ // mls v10.4s, v24.4s, v29.4s // .....................................................*......................|....................................................*...................... + // sub v24.4s, v9.4s, v11.4s // ..................................................*.........................|.................................................*......................... + // add v9.4s, v9.4s, v11.4s // ...................................................*........................|..................................................*........................ + // mul v11.4s, v24.4s, v1.s[0] // ........................................................*...................|.......................................................*................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................*..................|........................................................*.................. + // mls v11.4s, v24.4s, v29.4s // .....................................................................*......|....................................................................*...... + // srshr v24.4S, v8.4S, #23 // ..............................................*.............................|.............................................*............................. + // mls v8.4s, v24.4s, v29.4s // .......................................................*....................|......................................................*.................... + // srshr v24.4S, v9.4S, #23 // ..........................................................*.................|.........................................................*................. + // mls v9.4s, v24.4s, v29.4s // ...................................................................*........|..................................................................*........ 
+ // str q8, [x0], #(16*4) // ...............................................................*............|..............................................................*............ + // str q9, [x0, #(-16*4 + 1*16)] // ..........................................................................*.|.........................................................................*. + // str q10, [x0, #(-16*4 + 2*16)] // .................................................................*..........|................................................................*.......... + // str q11, [x0, #(-16*4 + 3*16)] // ...........................................................................*|..........................................................................* + + sub count, count, #1 + cbnz count, layer5678_start + trn1 v30.4S, v10.4S, v0.4S // .*.................................. + trn2 v10.4S, v10.4S, v0.4S // *................................... + ldr q14, [x4], #8 // ..*................................. + ldr q19, [x4], #16 // ...*................................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + trn2 v1.2D, v4.2D, v30.2D // .....*.............................. + trn1 v30.2D, v4.2D, v30.2D // ......*............................. + trn2 v24.2D, v5.2D, v10.2D // ....*............................... + trn1 v4.2D, v5.2D, v10.2D // .......*............................ + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sub v10.4S, v1.4S, v24.4S // .........*.......................... + sub v3.4S, v30.4S, v4.4S // ..........*......................... + add v30.4S, v30.4S, v4.4S // ...........*........................ + add v17.4S, v1.4S, v24.4S // ........*........................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v27.4S, v10.4S, v19.S[2] // ............*....................... + sqrdmulh v7.4S, v10.4S, v19.S[3] // .............*...................... + mul v0.4S, v3.4S, v19.S[0] // ..............*..................... + sqrdmulh v24.4S, v3.4S, v19.S[1] // ...............*.................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + sub v4.4S, v30.4S, v17.4S // ................*................... 
+ add v16.4S, v30.4S, v17.4S // .................*.................. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v27.4S, v7.4S, v29.4S // ..................*................. + mls v0.4S, v24.4S, v29.4S // ...................*................ + srshr v26.4S, v16.4S, #23 // ......................*............. + sqrdmulh v30.4S, v4.4S, v14.S[1] // .....................*.............. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v6.4S, v4.4S, v14.S[0] // ....................*............... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + sub v4.4S, v0.4S, v27.4S // .......................*............ + mls v16.4S, v26.4S, v29.4S // ..........................*......... + add v24.4S, v0.4S, v27.4S // ........................*........... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v6.4S, v30.4S, v29.4S // .........................*.......... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v10.4S, v4.4S, v14.S[0] // ...........................*........ + sqrdmulh v30.4S, v4.4S, v14.S[1] // ............................*....... + srshr v28.4S, v24.4S, #23 // .............................*...... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q16, [x0], #(16*4) // ..............................*..... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q6, [x0, #-32] // ...............................*.... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v24.4S, v28.4S, v29.4S // ................................*... + mls v10.4S, v30.4S, v29.4S // .................................*.. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q24, [x0, #-48] // ..................................*. + str q10, [x0, #-16] // ...................................* + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + + // original source code + // trn2 v28.4S, v10.4S, v0.4S // .*.................................. 
+ // trn1 v23.4S, v10.4S, v0.4S // *................................... + // ldr q22, [x4], #8 // ..*................................. + // ldr q20, [x4], #16 // ...*................................ + // trn2 v10.2D, v5.2D, v28.2D // ......*............................. + // trn2 v8.2D, v4.2D, v23.2D // ....*............................... + // trn1 v7.2D, v4.2D, v23.2D // .....*.............................. + // trn1 v6.2D, v5.2D, v28.2D // .......*............................ + // add v17.4S, v8.4S, v10.4S // ...........*........................ + // sub v18.4S, v8.4S, v10.4S // ........*........................... + // sub v28.4S, v7.4S, v6.4S // .........*.......................... + // add v5.4S, v7.4S, v6.4S // ..........*......................... + // mul v31.4S, v18.4S, v20.S[2] // ............*....................... + // sqrdmulh v7.4S, v18.4S, v20.S[3] // .............*...................... + // mul v2.4S, v28.4S, v20.S[0] // ..............*..................... + // sqrdmulh v23.4S, v28.4S, v20.S[1] // ...............*.................... + // sub v8.4S, v5.4S, v17.4S // ................*................... + // add v10.4S, v5.4S, v17.4S // .................*.................. + // mls v31.4S, v7.4S, v29.4S // ..................*................. + // mls v2.4S, v23.4S, v29.4S // ...................*................ + // mul v17.4S, v8.4S, v22.S[0] // ......................*............. + // sqrdmulh v12.4S, v8.4S, v22.S[1] // .....................*.............. + // srshr v30.4S, v10.4S, #23 // ....................*............... + // sub v4.4S, v2.4S, v31.4S // .......................*............ + // add v6.4S, v2.4S, v31.4S // .........................*.......... + // mls v17.4S, v12.4S, v29.4S // ..........................*......... + // mls v10.4S, v30.4S, v29.4S // ........................*........... + // mul v19.4S, v4.4S, v22.S[0] // ...........................*........ 
+ // sqrdmulh v4.4S, v4.4S, v22.S[1] // ............................*....... + // srshr v7.4S, v6.4S, #23 // .............................*...... + // str q10, [x0], #(16*4) // ..............................*..... + // str q17, [x0, #-32] // ...............................*.... + // mls v6.4S, v7.4S, v29.4S // ................................*... + // mls v19.4S, v4.4S, v29.4S // .................................*.. + // str q6, [x0, #-48] // ..................................*. + // str q19, [x0, #-16] // ...................................* + + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q17, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q12, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ 
+ ldr q27, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q16, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q9, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q14, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q11, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ ldr q22, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q15, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q28, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + sub v8.4S, v27.4S, v12.4S // ...............................*........................................................................................................................................................................................................................................................ + ldr q19, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + add v23.4S, v27.4S, v12.4S // ................................*....................................................................................................................................................................................................................................................... 
+ sub v27.4S, v17.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... + sub v21.4S, v16.4S, v9.4S // ....................................*................................................................................................................................................................................................................................................... + add v16.4S, v16.4S, v9.4S // .....................................*.................................................................................................................................................................................................................................................. + sub v12.4S, v11.4S, v14.4S // ..........................*............................................................................................................................................................................................................................................................. + add v20.4S, v11.4S, v14.4S // ...........................*............................................................................................................................................................................................................................................................ + mul v11.4S, v8.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sqrdmulh v10.4S, v8.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + mul v9.4S, v27.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v27.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + sub v27.4S, v28.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v21.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + add v8.4S, v28.4S, v15.4S // ......................*................................................................................................................................................................................................................................................................. 
+ mul v21.4S, v21.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + sub v28.4S, v22.4S, v19.4S // .........................................*.............................................................................................................................................................................................................................................. + add v19.4S, v22.4S, v19.4S // ..........................................*............................................................................................................................................................................................................................................. + sqrdmulh v15.4S, v27.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + mul v22.4S, v27.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + ldr q13, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
+ mls v21.4S, v14.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + ldr q14, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + mls v11.4S, v10.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + ldr q10, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + add v27.4S, v20.4S, v23.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v9.4S, v24.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ sub v23.4S, v20.4S, v23.4S // ..................................................................*..................................................................................................................................................................................................................... + ldr q20, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + add v24.4S, v17.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... + mul v18.4S, v28.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v17.4S, v28.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + add v28.4S, v16.4S, v19.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ mls v22.4S, v15.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mul v15.4S, v12.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sub v19.4S, v16.4S, v19.4S // ............................................................................*........................................................................................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + sub v16.4S, v14.4S, v13.4S // ..............................................*......................................................................................................................................................................................................................................... + add v13.4S, v14.4S, v13.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ mul v14.4S, v23.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mls v18.4S, v17.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + sub v17.4S, v10.4S, v20.4S // ...................................................*.................................................................................................................................................................................................................................... + add v20.4S, v10.4S, v20.4S // ....................................................*................................................................................................................................................................................................................................... + sub v10.4S, v24.4S, v8.4S // ........................................................*............................................................................................................................................................................................................................... 
+ mls v14.4S, v23.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v23.4S, v13.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ + sub v20.4S, v13.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. + mls v15.4S, v12.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + add v12.4S, v9.4S, v22.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v13.4S, v9.4S, v22.4S // .............................................................*.......................................................................................................................................................................................................................... 
+ add v24.4S, v24.4S, v8.4S // .........................................................*.............................................................................................................................................................................................................................. + mul v22.4S, v17.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v8.4S, v17.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + add v17.4S, v15.4S, v11.4S // ........................................................................*............................................................................................................................................................................................................... + sub v11.4S, v15.4S, v11.4S // .......................................................................*................................................................................................................................................................................................................ + sqrdmulh v15.4S, v19.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ 
+ mul v19.4S, v19.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sub v9.4S, v24.4S, v27.4S // ................................................................................................*....................................................................................................................................................................................... + add v24.4S, v24.4S, v27.4S // .................................................................................................*...................................................................................................................................................................................... + mul v27.4S, v20.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mls v22.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sqrdmulh v8.4S, v20.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. 
+ sub v20.4S, v21.4S, v18.4S // .................................................................................*...................................................................................................................................................................................................... + add v21.4S, v21.4S, v18.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v19.4S, v15.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + mul v18.4S, v16.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v16.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v16.4S, v28.4S, v23.4S // .....................................................................................................................*.................................................................................................................................................................. 
+ sub v28.4S, v28.4S, v23.4S // ....................................................................................................................*................................................................................................................................................................... + mul v23.4S, v10.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + mls v27.4S, v8.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sqrdmulh v8.4S, v10.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mls v18.4S, v15.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v15.4S, v19.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... 
+ mls v23.4S, v8.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + add v27.4S, v19.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ + add v8.4S, v24.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. + sub v16.4S, v24.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... + sqrdmulh v24.4S, v13.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mul v13.4S, v13.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
+ mul v19.4S, v11.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + add v10.4S, v23.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v14.4S, v23.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. + sqrdmulh v23.4S, v11.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + sub v11.4S, v18.4S, v22.4S // ...........................................................................................*............................................................................................................................................................................................ + add v18.4S, v18.4S, v22.4S // ............................................................................................*........................................................................................................................................................................................... 
+ mls v13.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + mul v24.4S, v14.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + mls v19.4S, v23.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + sqrdmulh v22.4S, v16.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v16.4S, v16.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
+ mls v24.4S, v14.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mul v23.4S, v11.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + sqrdmulh v14.4S, v11.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + add v11.4S, v13.4S, v19.4S // ................................................................................................................*....................................................................................................................................................................... + sub v13.4S, v13.4S, v19.4S // ...............................................................................................................*........................................................................................................................................................................ + sub v19.4S, v12.4S, v17.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ mls v16.4S, v22.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + add v22.4S, v12.4S, v17.4S // ......................................................................................................*................................................................................................................................................................................. + sqrdmulh v17.4S, v9.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + sub v12.4S, v21.4S, v18.4S // .........................................................................................................................*.............................................................................................................................................................. + add v18.4S, v21.4S, v18.4S // ..........................................................................................................................*............................................................................................................................................................. + mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... 
+ sqrdmulh v21.4S, v20.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mul v20.4S, v20.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + mls v23.4S, v14.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + mul v14.4S, v15.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mls v9.4S, v17.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... 
+ sqrdmulh v17.4S, v12.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v20.4S, v21.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + mul v21.4S, v12.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + mul v12.4S, v28.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v14.4S, v15.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ mul v15.4S, v19.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mls v21.4S, v17.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + mls v12.4S, v28.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + cmge v17.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... 
+ mls v15.4S, v19.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sub v19.4S, v10.4S, v27.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v27.4S // ...................................................................................................................................................*.................................................................................................................................... + add v27.4S, v20.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... + sub v23.4S, v20.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... + sub v17.4S, v28.4S, v17.4S // ..................................................................................................................................................................................*..................................................................................................... 
+ sub count, count, #1 +layer1234_start: + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v17.4S, v22.4S, v18.4S // .............................................................................................................................................*.......................................................................................................................................... + sub v20.4S, v9.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v9.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... + add v9.4S, v22.4S, v18.4S // ..............................................................................................................................................*......................................................................................................................................... + mul v28.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
+ sqrdmulh v22.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sub v18.4S, v15.4S, v21.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v15.4S, v21.4S // ..................................................................................................................................................................*..................................................................................................................... + mul v21.4S, v20.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v20.4S, v20.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + sub v15.4S, v24.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. 
+ add v14.4S, v24.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v24.4S, v23.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v16.4S, v18.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mls v28.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + sqrdmulh v22.4S, v23.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ sub v23.4S, v11.4S, v27.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v27.4S // ........................................................................................................................................................*............................................................................................................................... + mls v21.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v27.4S, v18.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v20.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
+ sqrdmulh v19.4S, v17.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + mls v24.4S, v22.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v22.4S, v15.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v27.4S, v16.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + mls v17.4S, v19.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ cmge v19.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v16.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v18.4S, v20.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + sqrdmulh v20.4S, v15.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + add v15.4S, v28.4S, v24.4S // ............................................................................................................................................................................*........................................................................................................... + sub v24.4S, v28.4S, v24.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ sub v28.4S, v19.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... + mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v16.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mul v23.4S, v24.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v24.4S, v24.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v21.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ cmge v28.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v22.4S, v20.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v20.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + mls v19.4S, v16.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v20.4S, v20.4S, v28.4S // ..........................................................................................................................................................................................*............................................................................................. + cmge v28.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + cmge v24.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + str q21, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ sub v21.4S, v28.4S, v24.4S // ......................................................................................................................................................................................*................................................................................................. + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v20.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v24.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... 
+ mls v17.4S, v21.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sub v28.4S, v24.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + sqrdmulh v24.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sub v13.4S, v20.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... 
+ str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v8.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v18.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + mls v23.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + sqrdmulh v13.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v21.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v24.4S, v18.4S, v8.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + cmge v10.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + sqrdmulh v8.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ 
+ str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + mls v22.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + cmge v15.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v12.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................*................................................................................... + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ cmge v24.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sub v10.4S, v12.4S, v10.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v12.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v18.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v13.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ sub v11.4S, v11.4S, v15.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v15.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ sub v14.4S, v12.4S, v24.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v27.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + mls v19.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + cmge v28.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v9.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v10.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v12.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + mls v22.4S, v13.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v13.4S, v9.4S, v15.4S // ......................................................................................................................................................................................................................................................................*................. + sub v24.4S, v8.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ str q27, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + ldr q9, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + cmge v8.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + ldr q16, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. 
+ mls v20.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v14.4S, v12.4S, v10.4S // ..............................................................................................................................................................................................................................................................................*......... + ldr q10, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + cmge v15.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + ldr q12, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ ldr q11, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v24.4S, v27.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v8.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v27.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + mls v23.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ ldr q14, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + add v24.4S, v9.4S, v16.4S // ...........................e............................................................................................................................................................................................................................................................ + sub v13.4S, v9.4S, v16.4S // ..........................e............................................................................................................................................................................................................................................................. + sub v16.4S, v27.4S, v28.4S // ..............................................................................................................................................................................................................................................................*......................... 
+ str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v27.4S, v10.4S, v12.4S // ...............................e........................................................................................................................................................................................................................................................ + sub v15.4S, v8.4S, v15.4S // ..........................................................................................................................................................................................................................................................................*............. + sub v18.4S, v11.4S, v14.4S // ....................................e................................................................................................................................................................................................................................................... + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + add v20.4S, v11.4S, v14.4S // .....................................e.................................................................................................................................................................................................................................................. 
+ add v8.4S, v10.4S, v12.4S // ................................e....................................................................................................................................................................................................................................................... + ldr q10, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + ldr q14, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + ldr q28, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + ldr q12, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + mls v19.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ sqrdmulh v9.4S, v27.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + ldr q17, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + ldr q16, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + mul v11.4S, v27.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ add v27.4S, v24.4S, v8.4S // ...................................................................e.................................................................................................................................................................................................................... + mul v21.4S, v18.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + sub v23.4S, v24.4S, v8.4S // ..................................................................e..................................................................................................................................................................................................................... + sqrdmulh v8.4S, v18.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + mul v18.4S, v13.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v13.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mls v22.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + add v15.4S, v28.4S, v10.4S // ..........................................e............................................................................................................................................................................................................................................. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + sub v19.4S, v28.4S, v10.4S // .........................................e.............................................................................................................................................................................................................................................. + mul v10.4S, v23.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... 
+ add v13.4S, v16.4S, v14.4S // ......................e................................................................................................................................................................................................................................................................. + mls v11.4S, v9.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + sub v9.4S, v16.4S, v14.4S // .....................e.................................................................................................................................................................................................................................................................. + ldr q28, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
+ mul v14.4S, v19.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + mls v18.4S, v24.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + mls v21.4S, v8.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + sqrdmulh v16.4S, v19.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + ldr q24, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + mul v19.4S, v9.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ 
+ sqrdmulh v22.4S, v9.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + add v9.4S, v17.4S, v12.4S // .................e...................................................................................................................................................................................................................................................................... + sub v17.4S, v17.4S, v12.4S // ................e....................................................................................................................................................................................................................................................................... + ldr q12, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + ldr q8, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + mls v10.4S, v23.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. 
+ sub v23.4S, v20.4S, v15.4S // ............................................................................e........................................................................................................................................................................................................... + mls v14.4S, v16.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + add v16.4S, v20.4S, v15.4S // .............................................................................e.......................................................................................................................................................................................................... + add v20.4S, v9.4S, v13.4S // .........................................................e.............................................................................................................................................................................................................................. + mls v19.4S, v22.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + add v22.4S, v24.4S, v28.4S // ...............................................e........................................................................................................................................................................................................................................ 
+ sub v15.4S, v24.4S, v28.4S // ..............................................e......................................................................................................................................................................................................................................... + sub v24.4S, v9.4S, v13.4S // ........................................................e............................................................................................................................................................................................................................... + add v13.4S, v8.4S, v12.4S // ....................................................e................................................................................................................................................................................................................................... + sub v8.4S, v8.4S, v12.4S // ...................................................e.................................................................................................................................................................................................................................... + sub v28.4S, v22.4S, v13.4S // ......................................................................................e................................................................................................................................................................................................. + add v13.4S, v22.4S, v13.4S // .......................................................................................e................................................................................................................................................................................................ 
+ mul v22.4S, v17.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + mul v12.4S, v15.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + sub v9.4S, v20.4S, v27.4S // ................................................................................................e....................................................................................................................................................................................... + add v27.4S, v20.4S, v27.4S // .................................................................................................e...................................................................................................................................................................................... 
+ sub v20.4S, v16.4S, v13.4S // ....................................................................................................................e................................................................................................................................................................... + add v16.4S, v16.4S, v13.4S // .....................................................................................................................e.................................................................................................................................................................. + mul v13.4S, v24.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + sqrdmulh v24.4S, v24.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v22.4S, v17.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + add v17.4S, v18.4S, v11.4S // ........................................................................e............................................................................................................................................................................................................... 
+ mls v12.4S, v15.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sub v18.4S, v18.4S, v11.4S // .......................................................................e................................................................................................................................................................................................................ + mul v15.4S, v8.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sqrdmulh v11.4S, v8.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + sqrdmulh v8.4S, v28.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mls v13.4S, v24.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... 
+ add v24.4S, v22.4S, v19.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v19.4S, v22.4S, v19.4S // .............................................................e.......................................................................................................................................................................................................................... + mul v22.4S, v28.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + mls v15.4S, v11.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mul v11.4S, v19.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v28.4S, v19.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... 
+ sqrdmulh v19.4S, v23.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + mul v23.4S, v23.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + mls v11.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + mls v23.4S, v19.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v28.4S, v12.4S, v15.4S // ............................................................................................e........................................................................................................................................................................................... + sub v15.4S, v12.4S, v15.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ sub v12.4S, v27.4S, v16.4S // ........................................................................................................................................e............................................................................................................................................... + mls v22.4S, v8.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + add v8.4S, v27.4S, v16.4S // .........................................................................................................................................e.............................................................................................................................................. + add v19.4S, v13.4S, v10.4S // ...........................................................................................................e............................................................................................................................................................................ + sub v13.4S, v13.4S, v10.4S // ..........................................................................................................e............................................................................................................................................................................. + mul v16.4S, v12.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. 
+ sqrdmulh v12.4S, v12.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ + sub v10.4S, v24.4S, v17.4S // .....................................................................................................e.................................................................................................................................................................................. + add v27.4S, v23.4S, v22.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v23.4S, v23.4S, v22.4S // ..............................................................................................................................e......................................................................................................................................................... + add v22.4S, v24.4S, v17.4S // ......................................................................................................e................................................................................................................................................................................. + mul v24.4S, v13.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... 
+ sqrdmulh v17.4S, v18.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + mul v18.4S, v18.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + mls v16.4S, v12.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... + mul v12.4S, v20.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ 
+ mls v24.4S, v13.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... + mls v18.4S, v17.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + sqrdmulh v13.4S, v9.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sub v17.4S, v21.4S, v14.4S // .................................................................................e...................................................................................................................................................................................................... + mls v12.4S, v20.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... 
+ mul v20.4S, v15.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + add v21.4S, v21.4S, v14.4S // ..................................................................................e..................................................................................................................................................................................................... + mul v14.4S, v23.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mls v9.4S, v13.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... 
+ sub v13.4S, v11.4S, v18.4S // ...............................................................................................................e........................................................................................................................................................................ + add v11.4S, v11.4S, v18.4S // ................................................................................................................e....................................................................................................................................................................... + add v18.4S, v21.4S, v28.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v21.4S, v21.4S, v28.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v28.4S, v17.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mls v20.4S, v15.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + mul v15.4S, v10.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sqrdmulh v10.4S, v10.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mls v14.4S, v23.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + mls v28.4S, v17.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + sqrdmulh v23.4S, v21.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... 
+ mul v21.4S, v21.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + cmge v17.4S, v31.4S, v16.4S // ................................................................................................................................................................................e....................................................................................................... + mls v15.4S, v10.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + add v10.4S, v19.4S, v27.4S // ...................................................................................................................................................e.................................................................................................................................... + sub v19.4S, v19.4S, v27.4S // ..................................................................................................................................................e..................................................................................................................................... + cmge v27.4S, v16.4S, v30.4S // .................................................................................................................................................................................e...................................................................................................... 
+ mls v21.4S, v23.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... + sub v17.4S, v17.4S, v27.4S // ..................................................................................................................................................................................e..................................................................................................... + add v27.4S, v28.4S, v20.4S // ....................................................................................................................................e................................................................................................................................................... + sub v23.4S, v28.4S, v20.4S // ...................................................................................................................................e.................................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // ...................................e...............................................................................................................................|.......................................................................................................................................................e..................... 
+ // ldr q9, [x1, #(1*(512/8))] // ...............................e...................................................................................................................................|...................................................................................................................................................e......................... + // ldr q10, [x1, #(2*(512/8))] // ....................................e..............................................................................................................................|........................................................................................................................................................e.................... + // ldr q11, [x1, #(3*(512/8))] // .............................e.....................................................................................................................................|.................................................................................................................................................e........................... + // ldr q12, [x1, #(4*(512/8))] // e..................................................................................................................................................................|....................................................................................................................e........................................................ + // ldr q13, [x1, #(5*(512/8))] // ..e................................................................................................................................................................|......................................................................................................................e...................................................... 
+ // ldr q14, [x1, #(6*(512/8))] // .....e.............................................................................................................................................................|.........................................................................................................................e................................................... + // ldr q15, [x1, #(7*(512/8))] // .......e...........................................................................................................................................................|...........................................................................................................................e................................................. + // ldr q16, [x1, #(8*(512/8))] // .........e.........................................................................................................................................................|.............................................................................................................................e............................................... + // ldr q17, [x1, #(9*(512/8))] // ...............e...................................................................................................................................................|...................................................................................................................................e......................................... + // ldr q18, [x1, #(10*(512/8))] // ..............................e....................................................................................................................................|..................................................................................................................................................e.......................... 
+ // ldr q19, [x1, #(11*(512/8))] // ............................e......................................................................................................................................|................................................................................................................................................e............................ + // ldr q20, [x1, #(12*(512/8))] // .............................................................e.....................................................................................................|............................................................................................................................................................................. + // ldr q21, [x1, #(13*(512/8))] // .......................................................e...........................................................................................................|...........................................................................................................................................................................e. + // ldr q22, [x1, #(14*(512/8))] // ...................................................................e...............................................................................................|............................................................................................................................................................................. + // ldr q23, [x1, #(15*(512/8))] // ..................................................................e................................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v9.4s // .................................................................e.................................................................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v9.4s // ................................................................e..................................................................................................|............................................................................................................................................................................. + // mul v9.4s, v24.4s, v3.s[2] // .................................................................................e.................................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..................................................................................e................................................................................|............................................................................................................................................................................. + // mls v9.4s, v24.4s, v29.4s // ...........................................................................................e.......................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v10.4s, v11.4s // ......................................................e............................................................................................................|..........................................................................................................................................................................e.. + // add v10.4s, v10.4s, v11.4s // ...................................................e...............................................................................................................|.......................................................................................................................................................................e..... + // mul v11.4s, v24.4s, v4.s[0] // ..............................................................e....................................................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ...............................................................e...................................................................................................|............................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // .........................................................................e.........................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v12.4s, v13.4s // ...................e...............................................................................................................................................|.......................................................................................................................................e..................................... + // add v12.4s, v12.4s, v13.4s // ..................e................................................................................................................................................|......................................................................................................................................e...................................... + // mul v13.4s, v24.4s, v4.s[2] // ............................................e......................................................................................................................|................................................................................................................................................................e............ + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .............................................e.....................................................................................................................|.................................................................................................................................................................e........... + // mls v13.4s, v24.4s, v29.4s // ..........................................................e........................................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v14.4s, v15.4s // ......................e............................................................................................................................................|..........................................................................................................................................e.................................. + // add v14.4s, v14.4s, v15.4s // ...........................e.......................................................................................................................................|...............................................................................................................................................e............................. + // mul v15.4s, v24.4s, v5.s[0] // .....................................e.............................................................................................................................|.........................................................................................................................................................e................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // .................................e.................................................................................................................................|.....................................................................................................................................................e....................... + // mls v15.4s, v24.4s, v29.4s // ....................................................e..............................................................................................................|........................................................................................................................................................................e.... 
+ // sub v24.4s, v16.4s, v17.4s // ........................e..........................................................................................................................................|............................................................................................................................................e................................ + // add v16.4s, v16.4s, v17.4s // ..........................e........................................................................................................................................|..............................................................................................................................................e.............................. + // mul v17.4s, v24.4s, v5.s[2] // ........................................e..........................................................................................................................|............................................................................................................................................................e................ + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...........................................e.......................................................................................................................|...............................................................................................................................................................e............. + // mls v17.4s, v24.4s, v29.4s // ...........................................................e.......................................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v18.4s, v19.4s // .................................................e.................................................................................................................|.....................................................................................................................................................................e....... + // add v18.4s, v18.4s, v19.4s // ...............................................e...................................................................................................................|...................................................................................................................................................................e......... + // mul v19.4s, v24.4s, v6.s[0] // .........................................................e.........................................................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ............................................................e......................................................................................................|............................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // ......................................................................e............................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v20.4s, v21.4s // ...........................................................................e.......................................................................................|............................................................................................................................................................................. + // add v20.4s, v20.4s, v21.4s // ..........................................................................e........................................................................................|............................................................................................................................................................................. + // mul v21.4s, v24.4s, v6.s[2] // ...................................................................................e...............................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................e..............................................................................|............................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // .............................................................................................e.....................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v22.4s, v23.4s // ..............................................................................e....................................................................................|............................................................................................................................................................................. + // add v22.4s, v22.4s, v23.4s // .............................................................................e.....................................................................................|............................................................................................................................................................................. + // mul v23.4s, v24.4s, v7.s[0] // ...............................................................................................e...................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................e..................................................................|............................................................................................................................................................................. + // mls v23.4s, v24.4s, v29.4s // ......................................................................................................e............................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v10.4s // ............................................................................e......................................................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v10.4s // ........................................................................e..........................................................................................|............................................................................................................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // .........................................................................................e.........................................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..........................................................................................e........................................................................|............................................................................................................................................................................. + // mls v10.4s, v24.4s, v29.4s // ..................................................................................................e................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v9.4s, v11.4s // ....................................................................................................e..............................................................|............................................................................................................................................................................. + // add v9.4s, v9.4s, v11.4s // ...................................................................................................e...............................................................|............................................................................................................................................................................. + // mul v11.4s, v24.4s, v1.s[2] // .......................................................................................................e...........................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................................................e..........................................................|............................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // ...........................................................................................................e.......................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v12.4s, v14.4s // ..........................................e........................................................................................................................|..............................................................................................................................................................e.............. + // add v12.4s, v12.4s, v14.4s // .......................................e...........................................................................................................................|...........................................................................................................................................................e................. + // mul v14.4s, v24.4s, v2.s[0] // ..................................................e................................................................................................................|......................................................................................................................................................................e...... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................e.............................................................................................................|.........................................................................................................................................................................e... + // mls v14.4s, v24.4s, v29.4s // ....................................................................e..............................................................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v13.4s, v15.4s // ..............................................................................................e....................................................................|............................................................................................................................................................................. + // add v13.4s, v13.4s, v15.4s // ............................................................................................e......................................................................|............................................................................................................................................................................. + // mul v15.4s, v24.4s, v2.s[0] // ............................................................................................................................e......................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................................................e.......................................|............................................................................................................................................................................. + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................................................e................................|............................................................................................................................................................................. 
+ // sub v24.4s, v16.4s, v18.4s // .....................................................................e.............................................................................................|............................................................................................................................................................................. + // add v16.4s, v16.4s, v18.4s // .......................................................................e...........................................................................................|............................................................................................................................................................................. + // mul v18.4s, v24.4s, v2.s[2] // ..........................................................................................................e........................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .........................................................................................................e.........................................................|............................................................................................................................................................................. + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................e......................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v17.4s, v19.4s // .....................................................................................................................................e.............................|............................................................................................................................................................................. + // add v17.4s, v17.4s, v19.4s // .........................................................................................................................................e.........................|............................................................................................................................................................................. + // mul v19.4s, v24.4s, v2.s[2] // .................................................................................................................................................e.................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..................................................................................................................................................e................|............................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // .......................................................................................................................................................e...........|............................................................................................................................................................................. 
+ // sub v24.4s, v20.4s, v22.4s // ...............................................................................e...................................................................................|............................................................................................................................................................................. + // add v20.4s, v20.4s, v22.4s // ................................................................................e..................................................................................|............................................................................................................................................................................. + // mul v22.4s, v24.4s, v3.s[0] // .....................................................................................................e.............................................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................e.................................................................|............................................................................................................................................................................. + // mls v22.4s, v24.4s, v29.4s // ................................................................................................................e..................................................|............................................................................................................................................................................. 
+ // sub v24.4s, v21.4s, v23.4s // ..............................................................................................................e....................................................|............................................................................................................................................................................. + // add v21.4s, v21.4s, v23.4s // .............................................................................................................e.....................................................|............................................................................................................................................................................. + // mul v23.4s, v24.4s, v3.s[0] // .......................................................................................................................................e...........................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................................e..........................|............................................................................................................................................................................. + // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................e...............|............................................................................................................................................................................. 
+ // sub v24.4s, v8.4s, v12.4s // .....................................................................................e.............................................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v12.4s // ......................................................................................e............................................................................|............................................................................................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ....................................................................................................................................e..............................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................e...............................|............................................................................................................................................................................. + // mls v12.4s, v24.4s, v29.4s // ............................................................................................................................................e......................|............................................................................................................................................................................. 
+ // sub v24.4s, v9.4s, v13.4s // ......................................................................................................................e............................................|............................................................................................................................................................................. + // add v9.4s, v9.4s, v13.4s // .........................................................................................................................e.........................................|............................................................................................................................................................................. + // mul v13.4s, v24.4s, v0.s[2] // ....................................................................................................................................................e..............|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................................e.............|............................................................................................................................................................................. + // mls v13.4s, v24.4s, v29.4s // ...........................................................................................................................................................e.......|............................................................................................................................................................................. 
+ // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................e...............................................|............................................................................................................................................................................. + // add v10.4s, v10.4s, v14.4s // ..................................................................................................................e................................................|............................................................................................................................................................................. + // mul v14.4s, v24.4s, v0.s[2] // ..........................................................................................................................e........................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e.....................................|............................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // .................................................................................................................................e.................................|............................................................................................................................................................................. 
+ // sub v24.4s, v11.4s, v15.4s // .............................................................................................................................................e.....................|............................................................................................................................................................................. + // add v11.4s, v11.4s, v15.4s // ..............................................................................................................................................e....................|............................................................................................................................................................................. + // mul v15.4s, v24.4s, v0.s[2] // ...................................................................................................................................................................|....*........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................................................|.....*....................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............*............................................................................................................................................................. 
+ // sub v24.4s, v16.4s, v20.4s // .......................................................................................e...........................................................................|............................................................................................................................................................................. + // add v16.4s, v16.4s, v20.4s // ........................................................................................e..........................................................................|............................................................................................................................................................................. + // mul v20.4s, v24.4s, v1.s[0] // ...............................................................................................................................e...................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................................................................................................................................e..................................|............................................................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ......................................................................................................................................e............................|............................................................................................................................................................................. 
+ // sub v24.4s, v17.4s, v21.4s // ................................................................................................................................................e..................|............................................................................................................................................................................. + // add v17.4s, v17.4s, v21.4s // ...............................................................................................................................................e...................|............................................................................................................................................................................. + // mul v21.4s, v24.4s, v1.s[0] // .........................................................................................................................................................e.........|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................................................................................................e..........|............................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................................................................e...|............................................................................................................................................................................. 
+ // sub v24.4s, v18.4s, v22.4s // ........................................................................................................................e..........................................|............................................................................................................................................................................. + // add v18.4s, v18.4s, v22.4s // .......................................................................................................................e...........................................|............................................................................................................................................................................. + // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................e........................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................................e.......................|............................................................................................................................................................................. + // mls v22.4s, v24.4s, v29.4s // ......................................................................................................................................................e............|............................................................................................................................................................................. 
+ // sub v24.4s, v19.4s, v23.4s // ..................................................................................................................................................................e|............................................................................................................................................................................. + // add v19.4s, v19.4s, v23.4s // .................................................................................................................................................................e.|............................................................................................................................................................................. + // mul v23.4s, v24.4s, v1.s[0] // ...................................................................................................................................................................|.............*............................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................................|................*............................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................................|.........................*................................................................................................................................................... 
+ // sub v24.4s, v8.4s, v16.4s // ...............................................................................................................e...................................................|............................................................................................................................................................................. + // add v8.4s, v8.4s, v16.4s // .................................................................................................................e.................................................|............................................................................................................................................................................. + // mul v16.4s, v24.4s, v0.s[0] // ....................................................................................................................e..............................................|............................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................e.............................................|............................................................................................................................................................................. + // mls v16.4s, v24.4s, v29.4s // ..............................................................................................................................e....................................|............................................................................................................................................................................. 
+ // sub v24.4s, v9.4s, v17.4s // ...................................................................................................................................................................|*............................................................................................................................................................................ + // add v9.4s, v9.4s, v17.4s // ...................................................................................................................................................................|...*......................................................................................................................................................................... + // mul v17.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|........................*.................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.......................*..................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ...................................................................................................................................................................|............................*................................................................................................................................................ 
+ // sub v24.4s, v10.4s, v18.4s // .............................................................................................................................................................e.....|............................................................................................................................................................................. + // add v10.4s, v10.4s, v18.4s // ............................................................................................................................................................e......|............................................................................................................................................................................. + // mul v18.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|.....................*....................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|......................*...................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............................*............................................................................................................................................. 
+ // sub v24.4s, v11.4s, v19.4s // ...................................................................................................................................................................|.................*........................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ...................................................................................................................................................................|..................*.......................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|....................................*........................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.....................................*....................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ...................................................................................................................................................................|............................................*................................................................................................................................ 
+ // sub v24.4s, v12.4s, v20.4s // ...................................................................................................................................................................|.*........................................................................................................................................................................... + // add v12.4s, v12.4s, v20.4s // ...................................................................................................................................................................|..*.......................................................................................................................................................................... + // mul v20.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|........*.................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.........*................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...................*......................................................................................................................................................... 
+ // sub v24.4s, v13.4s, v21.4s // ...................................................................................................................................................................|......*...................................................................................................................................................................... + // add v13.4s, v13.4s, v21.4s // ...................................................................................................................................................................|.......*..................................................................................................................................................................... + // mul v21.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|....................*........................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|..............*.............................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...........................*................................................................................................................................................. 
+ // sub v24.4s, v14.4s, v22.4s // ...................................................................................................................................................................|..........*.................................................................................................................................................................. + // add v14.4s, v14.4s, v22.4s // ...................................................................................................................................................................|...........*................................................................................................................................................................. + // mul v22.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|..........................*.................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|................................*............................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ...................................................................................................................................................................|..........................................*.................................................................................................................................. 
+ // sub v24.4s, v15.4s, v23.4s // ...................................................................................................................................................................|..................................*.......................................................................................................................................... + // add v15.4s, v15.4s, v23.4s // ...................................................................................................................................................................|.................................*........................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ...................................................................................................................................................................|......................................*...................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................|.......................................*..................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ...................................................................................................................................................................|...............................................*............................................................................................................................. 
+ // cmge v27.4s, v31.4s, v16.4s // ..........................................................................................................................................................e........|............................................................................................................................................................................. + // cmge v28.4s, v16.4s, v30.4s // ..............................................................................................................................................................e....|............................................................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ................................................................................................................................................................e..|............................................................................................................................................................................. + // mls v16.4s, v28.4s, v29.4s // ...................................................................................................................................................................*............................................................................................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ...................................................................................................................................................................|.................................................*........................................................................................................................... 
+ // cmge v28.4s, v17.4s, v30.4s // ...................................................................................................................................................................|..................................................*.......................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.....................................................*....................................................................................................................... + // mls v17.4s, v28.4s, v29.4s // ...................................................................................................................................................................|...........................................................*................................................................................................................. + // cmge v27.4s, v31.4s, v18.4s // ...................................................................................................................................................................|...........................................*................................................................................................................................. + // cmge v28.4s, v18.4s, v30.4s // ...................................................................................................................................................................|.........................................*................................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|................................................*............................................................................................................................ + // mls v18.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.......................................................*..................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // ...................................................................................................................................................................|..........................................................*.................................................................................................................. + // cmge v28.4s, v19.4s, v30.4s // ...................................................................................................................................................................|......................................................*...................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.............................................................*............................................................................................................... 
+ // mls v19.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.........................................................................*................................................................................................... + // cmge v27.4s, v31.4s, v20.4s // ...................................................................................................................................................................|.............................*............................................................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // ...................................................................................................................................................................|..............................*.............................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...................................*......................................................................................................................................... + // mls v20.4s, v28.4s, v29.4s // ...................................................................................................................................................................|........................................*.................................................................................................................................... 
+ // cmge v27.4s, v31.4s, v21.4s // ...................................................................................................................................................................|.......................................................................................*..................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ...................................................................................................................................................................|..............................................................................*.............................................................................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...........................................................................................*................................................................................. + // mls v21.4s, v28.4s, v29.4s // ...................................................................................................................................................................|......................................................................................................*...................................................................... + // cmge v27.4s, v31.4s, v22.4s // ...................................................................................................................................................................|....................................................................*........................................................................................................ 
+ // cmge v28.4s, v22.4s, v30.4s // ...................................................................................................................................................................|...................................................................*......................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|............................................................................*................................................................................................ + // mls v22.4s, v28.4s, v29.4s // ...................................................................................................................................................................|....................................................................................*........................................................................................ + // cmge v27.4s, v31.4s, v23.4s // ...................................................................................................................................................................|........................................................*.................................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // ...................................................................................................................................................................|.........................................................*................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................*............................................................................................................. + // mls v23.4s, v28.4s, v29.4s // ...................................................................................................................................................................|.....................................................................*....................................................................................................... + // str q16, [x1, #(8*(512/8))] // ...................................................................................................................................................................|............*................................................................................................................................................................ + // str q17, [x1, #(9*(512/8))] // ...................................................................................................................................................................|.......................................................................*..................................................................................................... + // str q18, [x1, #(10*(512/8))] // ...................................................................................................................................................................|.................................................................*........................................................................................................... 
+ // str q19, [x1, #(11*(512/8))] // ...................................................................................................................................................................|...................................................................................*......................................................................................... + // str q20, [x1, #(12*(512/8))] // ...................................................................................................................................................................|...................................................*......................................................................................................................... + // str q21, [x1, #(13*(512/8))] // ...................................................................................................................................................................|.................................................................................................................*........................................................... + // str q22, [x1, #(14*(512/8))] // ...................................................................................................................................................................|.................................................................................................*........................................................................... + // str q23, [x1, #(15*(512/8))] // ...................................................................................................................................................................|..................................................................................*.......................................................................................... 
+ // mul v16.4s, v8.4s, v25.4s // ...................................................................................................................................................................|.............................................*............................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ...................................................................................................................................................................|..............................................*.............................................................................................................................. + // mls v16.4s, v8.4s, v29.4s // ...................................................................................................................................................................|....................................................*........................................................................................................................ + // mul v17.4s, v9.4s, v25.4s // ...................................................................................................................................................................|........................................................................*.................................................................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................................................................................................................................|..........................................................................*.................................................................................................. 
+ // mls v17.4s, v9.4s, v29.4s // ...................................................................................................................................................................|....................................................................................................*........................................................................ + // mul v18.4s, v10.4s, v25.4s // ...................................................................................................................................................................|.............................................................................*............................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ...................................................................................................................................................................|......................................................................*...................................................................................................... + // mls v18.4s, v10.4s, v29.4s // ...................................................................................................................................................................|.............................................................................................*............................................................................... + // mul v19.4s, v11.4s, v25.4s // ...................................................................................................................................................................|........................................................................................*.................................................................................... 
+ // sqrdmulh v11.4s, v11.4s, v26.4s // ...................................................................................................................................................................|...............................................................................*............................................................................................. + // mls v19.4s, v11.4s, v29.4s // ...................................................................................................................................................................|.......................................................................................................*..................................................................... + // mul v20.4s, v12.4s, v25.4s // ...................................................................................................................................................................|................................................................*............................................................................................................ + // sqrdmulh v12.4s, v12.4s, v26.4s // ...................................................................................................................................................................|..................................................................*.......................................................................................................... + // mls v20.4s, v12.4s, v29.4s // ...................................................................................................................................................................|.................................................................................*........................................................................................... 
+ // mul v21.4s, v13.4s, v25.4s // ...................................................................................................................................................................|............................................................*................................................................................................................ + // sqrdmulh v13.4s, v13.4s, v26.4s // ...................................................................................................................................................................|..............................................................*.............................................................................................................. + // mls v21.4s, v13.4s, v29.4s // ...................................................................................................................................................................|...........................................................................*................................................................................................. + // mul v22.4s, v14.4s, v25.4s // ...................................................................................................................................................................|..................................................................................................*.......................................................................... + // sqrdmulh v14.4s, v14.4s, v26.4s // ...................................................................................................................................................................|..............................................................................................*.............................................................................. 
+ // mls v22.4s, v14.4s, v29.4s // ...................................................................................................................................................................|..............................................................................................................*.............................................................. + // mul v23.4s, v15.4s, v25.4s // ...................................................................................................................................................................|.....................................................................................*....................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ...................................................................................................................................................................|................................................................................*............................................................................................ + // mls v23.4s, v15.4s, v29.4s // ...................................................................................................................................................................|...................................................................................................*......................................................................... + // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................................................................................|..........................................................................................*.................................................................................. 
+ // cmge v28.4s, v16.4s, v30.4s // ...................................................................................................................................................................|......................................................................................*...................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................................................*............................................................................. + // mls v16.4s, v28.4s, v29.4s // ...................................................................................................................................................................|..........................................................................................................*.................................................................. + // cmge v27.4s, v31.4s, v17.4s // ...................................................................................................................................................................|...................................................................................................................*......................................................... + // cmge v28.4s, v17.4s, v30.4s // .*.................................................................................................................................................................|.....................................................................................................................*....................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...........*.......................................................................................................................................................|...............................................................................................................................*............................................. + // mls v17.4s, v28.4s, v29.4s // .................*.................................................................................................................................................|.....................................................................................................................................*....................................... + // cmge v27.4s, v31.4s, v18.4s // ...................................................................................................................................................................|...........................................................................................................*................................................................. + // cmge v28.4s, v18.4s, v30.4s // ...................................................................................................................................................................|........................................................................................................*.................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|................................................................................................................*............................................................ 
+ // mls v18.4s, v28.4s, v29.4s // ..........*........................................................................................................................................................|..............................................................................................................................*.............................................. + // cmge v27.4s, v31.4s, v19.4s // .............*.....................................................................................................................................................|.................................................................................................................................*........................................... + // cmge v28.4s, v19.4s, v30.4s // ........*..........................................................................................................................................................|............................................................................................................................*................................................ + // sub v28.4s, v27.4s, v28.4s // ....................*..............................................................................................................................................|........................................................................................................................................*.................................... + // mls v19.4s, v28.4s, v29.4s // ................................*..................................................................................................................................|....................................................................................................................................................*........................ 
+ // cmge v27.4s, v31.4s, v20.4s // ...................................................................................................................................................................|............................................................................................*................................................................................ + // cmge v28.4s, v20.4s, v30.4s // ...................................................................................................................................................................|.........................................................................................*................................................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|.....................................................................................................*....................................................................... + // mls v20.4s, v28.4s, v29.4s // ...*...............................................................................................................................................................|.......................................................................................................................*..................................................... + // cmge v27.4s, v31.4s, v21.4s // ...................................................................................................................................................................|.........................................................................................................*................................................................... 
+ // cmge v28.4s, v21.4s, v30.4s // ...................................................................................................................................................................|................................................................................................*............................................................................ + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................................................................|...............................................................................................................*............................................................. + // mls v21.4s, v28.4s, v29.4s // ................*..................................................................................................................................................|....................................................................................................................................*........................................ + // cmge v27.4s, v31.4s, v22.4s // ............*......................................................................................................................................................|................................................................................................................................*............................................ + // cmge v28.4s, v22.4s, v30.4s // ......*............................................................................................................................................................|..........................................................................................................................*.................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................*...........................................................................................................................................|...........................................................................................................................................*................................. + // mls v22.4s, v28.4s, v29.4s // ..............................................*....................................................................................................................|..................................................................................................................................................................*.......... + // cmge v27.4s, v31.4s, v23.4s // ...................................................................................................................................................................|.............................................................................................................*............................................................... + // cmge v28.4s, v23.4s, v30.4s // ...................................................................................................................................................................|............................................................................................................*................................................................ + // sub v28.4s, v27.4s, v28.4s // ....*..............................................................................................................................................................|........................................................................................................................*.................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ..............*....................................................................................................................................................|..................................................................................................................................*.......................................... + // str q16, [x1], #(16) // ...................................................................................................................................................................|..................................................................................................................*.......................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // ..................................*................................................................................................................................|......................................................................................................................................................*...................... + // str q18, [x1, #(-16 + 2*(512/8))] // .....................*.............................................................................................................................................|.........................................................................................................................................*................................... + // str q19, [x1, #(-16 + 3*(512/8))] // ................................................*..................................................................................................................|....................................................................................................................................................................*........ 
+ // str q20, [x1, #(-16 + 4*(512/8))] // .........................*.........................................................................................................................................|.............................................................................................................................................*............................... + // str q21, [x1, #(-16 + 5*(512/8))] // ......................................*............................................................................................................................|..........................................................................................................................................................*.................. + // str q22, [x1, #(-16 + 6*(512/8))] // ........................................................*..........................................................................................................|............................................................................................................................................................................* + // str q23, [x1, #(-16 + 7*(512/8))] // .........................................*.........................................................................................................................|.............................................................................................................................................................*............... + + sub count, count, #1 + cbnz count, layer1234_start + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... 
+ sub v17.4S, v22.4S, v18.4S // .............................................................................................................................................*.......................................................................................................................................... + sub v28.4S, v9.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v9.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... + add v9.4S, v22.4S, v18.4S // ..............................................................................................................................................*......................................................................................................................................... + mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. 
+ sub v20.4S, v11.4S, v27.4S // .......................................................................................................................................................*................................................................................................................................ + sub v22.4S, v24.4S, v14.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v24.4S, v14.4S // .......................................................................................................................................................................*................................................................................................................ + sqrdmulh v24.4S, v17.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mul v17.4S, v17.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... 
+ mul v16.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + add v11.4S, v11.4S, v27.4S // ........................................................................................................................................................*............................................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sqrdmulh v27.4S, v20.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v18.4S, v19.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v19.4S, v20.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sqrdmulh v20.4S, v23.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mul v23.4S, v23.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v17.4S, v24.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v24.4S, v22.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v22.4S, v22.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v16.4S, v13.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ add v13.4S, v15.4S, v21.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v21.4S, v15.4S, v21.4S // .................................................................................................................................................................*...................................................................................................................... + mls v19.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v23.4S, v20.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v22.4S, v24.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + sqrdmulh v24.4S, v28.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
+ mul v28.4S, v28.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + mul v27.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v20.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + cmge v15.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + cmge v21.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v28.4S, v24.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
+ sub v24.4S, v21.4S, v15.4S // ......................................................................................................................................................................................*................................................................................................. + add v15.4S, v16.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... + sub v23.4S, v16.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ + mls v27.4S, v20.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v21.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ sqrdmulh v16.4S, v23.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mul v23.4S, v23.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v20.4S, v21.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + cmge v24.4S, v28.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v21.4S, v31.4S, v28.4S // ................................................................................................................................................................................................*....................................................................................... 
+ mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + sub v24.4S, v21.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v20.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v17.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v23.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sub v17.4S, v20.4S, v17.4S // ..........................................................................................................................................................................................................*............................................................................. + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v21.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + cmge v18.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + mls v28.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mls v22.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sub v20.4S, v18.4S, v20.4S // ..............................................................................................................................................................................................*......................................................................................... + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + cmge v24.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... 
+ mls v16.4S, v21.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v10.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + str q28, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v28.4S, v24.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... 
+ sqrdmulh v24.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v13.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v19.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v8.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
+ sqrdmulh v9.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mls v18.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v14.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + cmge v10.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
+ sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + cmge v11.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... 
+ cmge v15.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................*................................................................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mls v17.4S, v13.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mls v22.4S, v9.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + cmge v9.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v13.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... 
+ mls v21.4S, v24.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sub v10.4S, v10.4S, v11.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v15.4S, v15.4S, v8.4S // ......................................................................................................................................................................................................*................................................................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + sub v24.4S, v13.4S, v9.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ cmge v14.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v9.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v13.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v27.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ 
+ cmge v15.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + cmge v11.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v12.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q27, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v27.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v16.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v8.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + sub v15.4S, v15.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. + sub v11.4S, v12.4S, v11.4S // ..............................................................................................................................................................................................................................................................*......................... + cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. 
+ cmge v13.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + sub v14.4S, v14.4S, v9.4S // ......................................................................................................................................................................................................................................................*................................. + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v9.4S, v27.4S, v16.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v21.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ sub v16.4S, v10.4S, v28.4S // ..........................................................................................................................................................................................................................................................................*............. + sub v27.4S, v8.4S, v13.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v23.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + mls v22.4S, v16.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... 
+ str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. 
+ str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s new file mode 100644 index 00000000..e23094cc --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1,1710 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+///
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// Q-register loads/stores. `vec` names a register alias; the access goes
+// through the matching `qform_<vec>` alias.
+//   *_vo: immediate offset addressing, base register unchanged
+//   *_vi: post-increment addressing, base register advanced by `inc`
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// Three-operand vector arithmetic on four 32-bit lanes (.4s).
+.macro vsub d,a,b
+        sub \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vadd d,a,b
+        add \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmul d,a,b
+        mul \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmls d,a,b
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// Same operations, but the second source is a single lane \b.s[\i].
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmulq d,a,b,i
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// Lane interleaves on 64-bit (.2d) and 32-bit (.4s) elements.
+.macro trn1_d d,a,b
+        trn1 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn2_d d,a,b
+        trn2 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn1_s d,a,b
+        trn1 \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro trn2_s d,a,b
+        trn2 \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * modulus
+// NOTE: \src is overwritten with the sqrdmulh result.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq       \dst,  \src, \const, \idx0
+        vqrdmulhq   \src,  \src, \const, \idx1
+        vmls        \dst,  \src, modulus
+.endm
+
+// Full-vector variant of mulmodq: the constant and its twisted counterpart
+// are supplied as whole registers. \src is overwritten.
+.macro mulmod dst, src, const, const_twisted
+        vmul       \dst,  \src, \const
+        vqrdmulh   \src,  \src, \const_twisted
+        vmls       \dst,  \src, modulus
+.endm
+
+// a -= (a rounding-shifted right by 23) * modulus. Clobbers `tmp`.
+.macro barrett_reduce_single a
+        srshr tmp.4S,  \a\().4S, #23
+        vmls   \a, tmp, modulus
+.endm
+
+// Per lane: tmp1 = mask(neg_modulus_half >= a), tmp2 = mask(a >= modulus_half),
+// then a -= (tmp1 - tmp2) * modulus. cmge sets a lane to all-ones (-1) when
+// the comparison holds, so this adds the modulus in the first case and
+// subtracts it in the second.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+        vmls \a, \tmp2, modulus
+.endm
+
+// Butterfly with lane-indexed root:
+//   tmp = a - b;  a = a + b;  b = mulmodq(tmp, root[idx0], root[idx1])
+// Clobbers `tmp`.
+.macro gs_butterfly a, b, root, idx0, idx1
+        vsub     tmp,  \a,  \b
+        vadd     \a,   \a,  \b
+        mulmodq  \b,   tmp, \root, \idx0, \idx1
+.endm
+
+// Same expansion as `mulmod` above.
+.macro mulmod_v dst, src, const, const_twisted
+        vmul       \dst,  \src, \const
+        vqrdmulh   \src,  \src, \const_twisted
+        vmls       \dst,  \src, modulus
+.endm
+
+// Butterfly with full-vector root and twisted root. Clobbers `tmp`.
+.macro gs_butterfly_v a, b, root, root_twisted
+        vsub    tmp,  \a,  \b
+        vadd    \a,   \a,  \b
+        mulmod  \b,   tmp, \root, \root_twisted
+.endm
+
+// Applies mulmod with the `ninv` / `ninv_tw` register aliases to eight
+// source/destination pairs. Each \srcN is overwritten (see mulmod).
+.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+        mulmod \dst4, \src4, ninv, ninv_tw
+        mulmod \dst5, \src5, ninv, ninv_tw
+        mulmod \dst6, \src6, ninv, ninv_tw
+        mulmod \dst7, \src7, ninv, ninv_tw
+.endm
+
+// Loads root0..root7 from eight consecutive 16-byte slots at \r_ptr.
+// The first load post-increments \r_ptr by 8*16, so the remaining loads
+// use negative offsets relative to the advanced pointer.
+.macro load_roots_1234 r_ptr
+        ldr_vi root0, \r_ptr, (8*16)
+        ldr_vo root1, \r_ptr, (-8*16 + 1*16)
+        ldr_vo root2, \r_ptr, (-8*16 + 2*16)
+        ldr_vo root3, \r_ptr, (-8*16 + 3*16)
+        ldr_vo root4, \r_ptr, (-8*16 + 4*16)
+        ldr_vo root5, \r_ptr, (-8*16 + 5*16)
+        ldr_vo root6, \r_ptr, (-8*16 + 6*16)
+        ldr_vo root7, \r_ptr, (-8*16 + 7*16)
+.endm
+
+// Loads one 16-byte vector and advances the pointer by 16 bytes.
+.macro load_next_roots_56 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 16
+.endm
+
+// Loads one 16-byte vector but advances the pointer by only 8 bytes.
+.macro load_next_roots_6 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 8
+.endm
+
+// Loads three (root, twisted root) pairs from six consecutive 16-byte slots
+// and advances \r_ptr1 by 6*16 (same post-increment scheme as above).
+.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1
+        ldr_vi \root0,    \r_ptr1, (6*16)
+        ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16)
+        ldr_vo \root1,    \r_ptr1, (-6*16 + 2*16)
+        ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16)
+        ldr_vo \root2,    \r_ptr1, (-6*16 + 4*16)
+        ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit lanes across <data>0..<data>3,
+// using t0..t3 as scratch.
+.macro transpose4 data
+        trn1_s t0, \data\()0, \data\()1
+        trn2_s t1, \data\()0, \data\()1
+        trn1_s t2, \data\()2, \data\()3
+        trn2_s t3, \data\()2, \data\()3
+
+        trn2_d \data\()2, t0, t2
+        trn2_d \data\()3, t1, t3
+        trn1_d \data\()0, t0, t2
+        trn1_d \data\()1, t1, t3
+.endm
+
+// Reserves 6*16 bytes of stack and spills x19..x29 (the AAPCS64 callee-saved
+// general-purpose registers). Each pair is stored exactly once.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+// Inverse of save_gprs: reloads x19..x29 and releases the 6*16 bytes.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// Reserves 4*16 bytes of stack and spills d8..d15 (low halves of v8..v15).
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp  d8,  d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// Inverse of save_vregs: reloads d8..d15 and releases the 4*16 bytes.
+.macro restore_vregs // slothy:no-unfold
+        ldp  d8,  d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Size of the local scratch area below the saved registers, and the offset
+// of its only slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// Load / store a register from / to the scratch slot at [sp, #loc].
+// Note the differing argument order: `restore reg, loc` vs `save loc, reg`.
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+
+// Function prologue: spill GPRs, spill vector registers, then reserve the
+// STACK_SIZE-byte scratch area.
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// Function epilogue: undoes push_stack in reverse order.
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle table, 16-byte aligned, pulled in from a separate file.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_1234_5678_twiddles.s"
+.text
+
+        .global intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm
+        .global _intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm
+
+// Scalar constants, loaded through ASM_LOAD.
+.p2align 4
+modulus_addr:   .quad 8380417
+ninv_addr:      .quad 16382
+ninv_tw_addr:   .quad 4197891
+intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm:
+_intt_dilithium_1234_5678_manual_ld4_opt_m1_icestorm:
+        push_stack
+
+        inp     .req x0
+        in      .req x1
+        count   .req x2
+        r_ptr0  .req x3
+        r_ptr1  .req x4
+        xtmp    .req x5
+
+        data0  .req v8
+        data1  .req v9
+        data2  .req v10
+
data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + // gap // ................................................. + ldr q6, [x0, #32] // ..*.............................................. + ldr q18, [x0, #48] // ...*............................................. + // gap // ................................................. + ldr q11, [x0, #16] // .*............................................... + ldr q5, [x0, #0] // *................................................ + // gap // ................................................. + // gap // ................................................. + ldr q4, [x3, #64] // ..........*...................................... + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + ldr q1, [x3, #80] // ....*............................................ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v12.4S, v6.4S, v18.4S // .......*......................................... + trn1 v7.4S, v6.4S, v18.4S // .........*....................................... + ldr q28, [x3, #48] // ........*........................................ + // gap // ................................................. + trn2 v3.4S, v5.4S, v11.4S // ......*.......................................... + trn1 v16.4S, v5.4S, v11.4S // .....*........................................... + ldr q5, [x3, #32] // ...........*..................................... + // gap // ................................................. + ldr q30, [x4], #8 // ...................................*............. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v27.2D, v16.2D, v7.2D // ............*.................................... + ldr q19, [x3, #16] // ....................*............................ + trn2 v14.2D, v3.2D, v12.2D // .............*................................... + // gap // ................................................. + trn1 v22.2D, v3.2D, v12.2D // ...............*................................. + trn1 v10.2D, v16.2D, v7.2D // ..............*.................................. + ldr q12, [x3], #(6*16) // .................*............................... + // gap // ................................................. + sub v2.4S, v27.4S, v14.4S // ................*................................ + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + sub v17.4S, v10.4S, v22.4S // ...................*............................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mul v21.4S, v2.4S, v4.4S // .......................*......................... + sqrdmulh v11.4S, v2.4S, v1.4S // ......................*.......................... + // gap // ................................................. + // gap // ................................................. + sqrdmulh v24.4S, v17.4S, v28.4S // ........................*........................ + mul v28.4S, v17.4S, v5.4S // .........................*....................... + ldr q17, [x4], #16 // ....................................*............ + // gap // ................................................. + add v8.4S, v27.4S, v14.4S // ..................*.............................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + add v9.4S, v10.4S, v22.4S // .....................*........................... + mls v21.4S, v11.4S, v29.4S // ..........................*...................... + // gap // ................................................. + // gap // ................................................. + mls v28.4S, v24.4S, v29.4S // ...........................*..................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v23.4S, v9.4S, v8.4S // ............................*.................... + // gap // ................................................. + // gap // ................................................. 
+ // gap // ................................................. + add v9.4S, v9.4S, v8.4S // .....................................*........... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v26.4S, v28.4S, v21.4S // .............................*................... + sqrdmulh v6.4S, v23.4S, v19.4S // ..............................*.................. + // gap // ................................................. + // gap // ................................................. + mul v5.4S, v23.4S, v12.4S // ...............................*................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v2.4S, v26.4S, v19.4S // .................................*............... + mul v20.4S, v26.4S, v12.4S // ................................*................ + // gap // ................................................. + // gap // ................................................. + add v13.4S, v28.4S, v21.4S // ..................................*.............. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v5.4S, v6.4S, v29.4S // ......................................*.......... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v20.4S, v2.4S, v29.4S // .......................................*......... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ trn1 v21.4S, v9.4S, v13.4S // ........................................*........ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v2.4S, v9.4S, v13.4S // .........................................*....... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v22.4S, v5.4S, v20.4S // ...........................................*..... + trn1 v14.4S, v5.4S, v20.4S // ..........................................*...... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn1 v18.2D, v2.2D, v22.2D // ............................................*.... + trn1 v31.2D, v21.2D, v14.2D // .............................................*... + // gap // ................................................. + // gap // ................................................. + trn2 v5.2D, v2.2D, v22.2D // ...............................................*. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v22.2D, v21.2D, v14.2D // ..............................................*.. + sub v25.4S, v31.4S, v18.4S // ................................................* + // gap // ................................................. + // gap // ................................................. 
+ + // original source code + // ldr q14, [x0, #0] // ...*............................................. + // ldr q15, [x0, #16] // ..*.............................................. + // ldr q18, [x0, #32] // *................................................ + // ldr q13, [x0, #48] // .*............................................... + // ldr q24, [x3, #80] // .....*........................................... + // trn1 v12.4S, v14.4S, v15.4S // ..........*...................................... + // trn2 v31.4S, v14.4S, v15.4S // .........*....................................... + // trn2 v15.4S, v18.4S, v13.4S // ......*.......................................... + // ldr q28, [x3, #48] // ........*........................................ + // trn1 v14.4S, v18.4S, v13.4S // .......*......................................... + // ldr q21, [x3, #64] // ....*............................................ + // ldr q23, [x3, #32] // ...........*..................................... + // trn2 v25.2D, v12.2D, v14.2D // .............*................................... + // trn2 v27.2D, v31.2D, v15.2D // ...............*................................. + // trn1 v19.2D, v12.2D, v14.2D // .................*............................... + // trn1 v1.2D, v31.2D, v15.2D // ................*................................ + // sub v11.4S, v25.4S, v27.4S // ...................*............................. + // ldr q20, [x3], #(6*16) // ..................*.............................. + // add v3.4S, v25.4S, v27.4S // ..........................*...................... + // sub v18.4S, v19.4S, v1.4S // ....................*............................ + // ldr q6, [x3, #-80] // ..............*.................................. + // add v7.4S, v19.4S, v1.4S // ...........................*..................... + // sqrdmulh v31.4S, v11.4S, v24.4S // ......................*.......................... + // mul v19.4S, v11.4S, v21.4S // .....................*........................... 
+ // sqrdmulh v28.4S, v18.4S, v28.4S // .......................*......................... + // mul v14.4S, v18.4S, v23.4S // ........................*........................ + // mls v19.4S, v31.4S, v29.4S // ............................*.................... + // mls v14.4S, v28.4S, v29.4S // .............................*................... + // sub v16.4S, v7.4S, v3.4S // ..............................*.................. + // sub v31.4S, v14.4S, v19.4S // ................................*................ + // sqrdmulh v25.4S, v16.4S, v6.4S // .................................*............... + // mul v5.4S, v16.4S, v20.4S // ..................................*.............. + // mul v8.4S, v31.4S, v20.4S // ....................................*............ + // sqrdmulh v1.4S, v31.4S, v6.4S // ...................................*............. + // add v31.4S, v14.4S, v19.4S // .....................................*........... + // ldr q30, [x4], #8 // ............*.................................... + // ldr q17, [x4], #16 // .........................*....................... + // add v6.4S, v7.4S, v3.4S // ...............................*................. + // mls v5.4S, v25.4S, v29.4S // ......................................*.......... + // mls v8.4S, v1.4S, v29.4S // .......................................*......... + // trn1 v7.4S, v6.4S, v31.4S // ........................................*........ + // trn2 v4.4S, v6.4S, v31.4S // .........................................*....... + // trn1 v9.4S, v5.4S, v8.4S // ...........................................*..... + // trn2 v14.4S, v5.4S, v8.4S // ..........................................*...... + // trn1 v18.2D, v4.2D, v14.2D // ............................................*.... + // trn1 v31.2D, v7.2D, v9.2D // .............................................*... + // trn2 v22.2D, v7.2D, v9.2D // ...............................................*. 
+ // trn2 v5.2D, v4.2D, v14.2D // ..............................................*.. + // sub v25.4S, v31.4S, v18.4S // ................................................* + + sub count, count, #1 +layer5678_start: + ldr q14, [x0, #64] // e........................................................................... + add v10.4S, v31.4S, v18.4S // .................................................*.......................... + ldr q15, [x0, #80] // .e.......................................................................... + sub v0.4S, v22.4S, v5.4S // .....................................................*...................... + ldr q18, [x0, #96] // ..e......................................................................... + sqrdmulh v2.4S, v25.4S, v17.S[1] // ...................................................*........................ + add v5.4S, v22.4S, v5.4S // ......................................................*..................... + ldr q13, [x0, #112] // ...e........................................................................ + mul v8.4S, v25.4S, v17.S[0] // ..................................................*......................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v19.4S, v0.4S, v17.S[3] // ........................................................*................... + mul v9.4S, v0.4S, v17.S[2] // .......................................................*.................... + add v0.4S, v10.4S, v5.4S // ...........................................................*................ + // gap // ............................................................................ + ldr q24, [x3, #80] // .................e.......................................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + trn1 v12.4S, v14.4S, v15.4S // ....e....................................................................... + trn2 v31.4S, v14.4S, v15.4S // .....e...................................................................... + trn2 v15.4S, v18.4S, v13.4S // .......e.................................................................... + // gap // ............................................................................ + ldr q28, [x3, #48] // ...............e............................................................ + trn1 v14.4S, v18.4S, v13.4S // ......e..................................................................... + mls v9.4S, v19.4S, v29.4S // .........................................................*.................. + ldr q21, [x3, #64] // ................e........................................................... + ldr q23, [x3, #32] // ..............e............................................................. + mls v8.4S, v2.4S, v29.4S // ....................................................*....................... + // gap // ............................................................................ + trn2 v25.2D, v12.2D, v14.2D // ........e................................................................... + // gap // ............................................................................ + trn2 v27.2D, v31.2D, v15.2D // .........e.................................................................. + // gap // ............................................................................ + trn1 v19.2D, v12.2D, v14.2D // ..........e................................................................. + trn1 v1.2D, v31.2D, v15.2D // ...........e................................................................ + // gap // ............................................................................ 
+ sub v11.4S, v25.4S, v27.4S // .......................e.................................................... + // gap // ............................................................................ + ldr q20, [x3], #(6*16) // ............e............................................................... + add v3.4S, v25.4S, v27.4S // ........................e................................................... + sub v18.4S, v19.4S, v1.4S // ..................e......................................................... + ldr q6, [x3, #-80] // .............e.............................................................. + add v7.4S, v19.4S, v1.4S // ...................e........................................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v31.4S, v11.4S, v24.4S // ..........................e................................................. + mul v19.4S, v11.4S, v21.4S // .........................e.................................................. + // gap // ............................................................................ + sqrdmulh v28.4S, v18.4S, v28.4S // .....................e...................................................... + mul v14.4S, v18.4S, v23.4S // ....................e....................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v2.4S, v8.4S, v9.4S // ...............................................................*............ + // gap // ............................................................................ + // gap // ............................................................................ 
+ sub v15.4S, v10.4S, v5.4S // ..........................................................*................. + // gap // ............................................................................ + // gap // ............................................................................ + mls v19.4S, v31.4S, v29.4S // ...........................e................................................ + add v12.4S, v8.4S, v9.4S // ................................................................*........... + mls v14.4S, v28.4S, v29.4S // ......................e..................................................... + // gap // ............................................................................ + // gap // ............................................................................ + sub v16.4S, v7.4S, v3.4S // ............................e............................................... + srshr v9.4S, v12.4S, #23 // ......................................................................*..... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v27.4S, v15.4S, v30.S[1] // .............................................................*.............. + // gap // ............................................................................ + // gap // ............................................................................ + srshr v18.4S, v0.4S, #23 // ....................................................................*....... + mul v13.4S, v2.4S, v30.S[0] // .................................................................*.......... + // gap // ............................................................................ + // gap // ............................................................................ + sub v31.4S, v14.4S, v19.4S // .................................e.......................................... 
+ sqrdmulh v25.4S, v16.4S, v6.4S // ...............................e............................................ + mls v12.4S, v9.4S, v29.4S // .......................................................................*.... + // gap // ............................................................................ + // gap // ............................................................................ + mul v5.4S, v16.4S, v20.4S // ..............................e............................................. + mul v8.4S, v31.4S, v20.4S // ...................................e........................................ + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v1.4S, v31.4S, v6.4S // ....................................e....................................... + add v31.4S, v14.4S, v19.4S // ..................................e......................................... + // gap // ............................................................................ + // gap // ............................................................................ + sqrdmulh v2.4S, v2.4S, v30.S[1] // ..................................................................*......... + mul v19.4S, v15.4S, v30.S[0] // ............................................................*............... + ldr q30, [x4], #8 // ..............................................e............................. + ldr q17, [x4], #16 // ...............................................e............................ + add v6.4S, v7.4S, v3.4S // .............................e.............................................. + mls v5.4S, v25.4S, v29.4S // ................................e........................................... + // gap // ............................................................................ 
+ // gap // ............................................................................ + mls v8.4S, v1.4S, v29.4S // .....................................e...................................... + // gap // ............................................................................ + mls v0.4S, v18.4S, v29.4S // .....................................................................*...... + // gap // ............................................................................ + mls v13.4S, v2.4S, v29.4S // ...................................................................*........ + trn1 v7.4S, v6.4S, v31.4S // ......................................e..................................... + trn2 v4.4S, v6.4S, v31.4S // .......................................e.................................... + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v9.4S, v5.4S, v8.4S // ........................................e................................... + trn2 v14.4S, v5.4S, v8.4S // .........................................e.................................. + // gap // ............................................................................ + // gap // ............................................................................ + str q0, [x0], #(16*4) // ........................................................................*... + mls v19.4S, v27.4S, v29.4S // ..............................................................*............. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + trn1 v18.2D, v4.2D, v14.2D // .............................................e.............................. 
+ trn1 v31.2D, v7.2D, v9.2D // ............................................e............................... + str q12, [x0, #-48] // .........................................................................*.. + trn2 v22.2D, v7.2D, v9.2D // ..........................................e................................. + trn2 v5.2D, v4.2D, v14.2D // ...........................................e................................ + str q13, [x0, #-16] // ...........................................................................* + // gap // ............................................................................ + sub v25.4S, v31.4S, v18.4S // ................................................e........................... + str q19, [x0, #-32] // ..........................................................................*. + // gap // ............................................................................ + // gap // ............................................................................ + + // original source code + // ldr q8, [x0, #(16*0)] // e...........................................................................e........................................................................... + // ldr q9, [x0, #(16*1)] // ..e.........................................................................|.e......................................................................... + // ldr q10, [x0, #(16*2)] // ....e.......................................................................|...e....................................................................... + // ldr q11, [x0, #(16*3)] // .......e....................................................................|......e.................................................................... + // trn1 v25.4s, v8.4s, v9.4s // .............e..............................................................|............e.............................................................. 
+ // trn2 v26.4s, v8.4s, v9.4s // ..............e.............................................................|.............e............................................................. + // trn1 v27.4s, v10.4s, v11.4s // .................e..........................................................|................e.......................................................... + // trn2 v28.4s, v10.4s, v11.4s // ...............e............................................................|..............e............................................................ + // trn2 v10.2d, v25.2d, v27.2d // ......................e.....................................................|.....................e..................................................... + // trn2 v11.2d, v26.2d, v28.2d // .......................e....................................................|......................e.................................................... + // trn1 v8.2d, v25.2d, v27.2d // ........................e...................................................|.......................e................................................... + // trn1 v9.2d, v26.2d, v28.2d // .........................e..................................................|........................e.................................................. + // ldr q0, [x3], #(6*16) // ...........................e................................................|..........................e................................................ + // ldr q4, [x3, #(-6*16 + 1*16)] // ..............................e.............................................|.............................e............................................. + // ldr q1, [x3, #(-6*16 + 2*16)] // ....................e.......................................................|...................e....................................................... 
+ // ldr q5, [x3, #(-6*16 + 3*16)] // ................e...........................................................|...............e........................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...................e........................................................|..................e........................................................ + // ldr q6, [x3, #(-6*16 + 5*16)] // ............e...............................................................|...........e............................................................... + // sub v24.4s, v8.4s, v9.4s // .............................e..............................................|............................e.............................................. + // add v8.4s, v8.4s, v9.4s // ...............................e............................................|..............................e............................................ + // mul v9.4s, v24.4s, v1.4s // ...................................e........................................|..................................e........................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ..................................e.........................................|.................................e......................................... + // mls v9.4s, v24.4s, v29.4s // ........................................e...................................|.......................................e................................... + // sub v24.4s, v10.4s, v11.4s // ..........................e.................................................|.........................e................................................. + // add v10.4s, v10.4s, v11.4s // ............................e...............................................|...........................e............................................... 
+ // mul v11.4s, v24.4s, v2.4s // .................................e..........................................|................................e.......................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ................................e...........................................|...............................e........................................... + // mls v11.4s, v24.4s, v29.4s // ......................................e.....................................|.....................................e..................................... + // sub v24.4s, v8.4s, v10.4s // .........................................e..................................|........................................e.................................. + // add v8.4s, v8.4s, v10.4s // .........................................................e..................|........................................................e.................. + // mul v10.4s, v24.4s, v0.4s // .................................................e..........................|................................................e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e............................|..............................................e............................ + // mls v10.4s, v24.4s, v29.4s // ..........................................................e.................|.........................................................e................. + // sub v24.4s, v9.4s, v11.4s // ..............................................e.............................|.............................................e............................. + // add v9.4s, v9.4s, v11.4s // ....................................................e.......................|...................................................e....................... 
+ // mul v11.4s, v24.4s, v0.4s // ..................................................e.........................|.................................................e......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e........................|..................................................e........................ + // mls v11.4s, v24.4s, v29.4s // ...........................................................e................|..........................................................e................ + // trn1 v25.4s, v8.4s, v9.4s // ..............................................................e.............|.............................................................e............. + // trn2 v26.4s, v8.4s, v9.4s // ...............................................................e............|..............................................................e............ + // trn1 v27.4s, v10.4s, v11.4s // ................................................................e...........|...............................................................e........... + // trn2 v28.4s, v10.4s, v11.4s // .................................................................e..........|................................................................e.......... + // trn2 v10.2d, v25.2d, v27.2d // .......................................................................e....|......................................................................e.... + // trn2 v11.2d, v26.2d, v28.2d // ........................................................................e...|.......................................................................e... + // trn1 v8.2d, v25.2d, v27.2d // .....................................................................e......|....................................................................e...... 
+ // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e.......|...................................................................e....... + // ldr q1, [x4], #8 // .......................................................e....................|......................................................e.................... + // ldr q0, [x4], #16 // ........................................................e...................|.......................................................e................... + // sub v24.4s, v8.4s, v9.4s // ..........................................................................e.|.........................................................................e. + // add v8.4s, v8.4s, v9.4s // .*..........................................................................|*.......................................................................... + // mul v9.4s, v24.4s, v0.s[0] // ........*...................................................................|.......*................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....*......................................................................|....*...................................................................... + // mls v9.4s, v24.4s, v29.4s // .....................*......................................................|....................*...................................................... + // sub v24.4s, v10.4s, v11.4s // ...*........................................................................|..*........................................................................ + // add v10.4s, v10.4s, v11.4s // ......*.....................................................................|.....*..................................................................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ..........*.................................................................|.........*................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........*..................................................................|........*.................................................................. + // mls v11.4s, v24.4s, v29.4s // ..................*.........................................................|.................*......................................................... + // sub v24.4s, v8.4s, v10.4s // .....................................*......................................|....................................*...................................... + // add v8.4s, v8.4s, v10.4s // ...........*................................................................|..........*................................................................ + // mul v10.4s, v24.4s, v1.s[0] // ......................................................*.....................|.....................................................*..................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................*................................|..........................................*................................ + // mls v10.4s, v24.4s, v29.4s // ...................................................................*........|..................................................................*........ + // sub v24.4s, v9.4s, v11.4s // ....................................*.......................................|...................................*....................................... + // add v9.4s, v9.4s, v11.4s // .......................................*....................................|......................................*.................................... 
+ // mul v11.4s, v24.4s, v1.s[0] // .............................................*..............................|............................................*.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................*......................|....................................................*...................... + // mls v11.4s, v24.4s, v29.4s // .............................................................*..............|............................................................*.............. + // srshr v24.4S, v8.4S, #23 // ............................................*...............................|...........................................*............................... + // mls v8.4s, v24.4s, v29.4s // ............................................................*...............|...........................................................*............... + // srshr v24.4S, v9.4S, #23 // ..........................................*.................................|.........................................*................................. + // mls v9.4s, v24.4s, v29.4s // ................................................*...........................|...............................................*........................... + // str q8, [x0], #(16*4) // ..................................................................*.........|.................................................................*......... + // str q9, [x0, #(-16*4 + 1*16)] // ......................................................................*.....|.....................................................................*..... 
+ // str q10, [x0, #(-16*4 + 2*16)] // ...........................................................................*|..........................................................................* + // str q11, [x0, #(-16*4 + 3*16)] // .........................................................................*..|........................................................................*.. + + sub count, count, #1 + cbnz count, layer5678_start + sub v23.4S, v22.4S, v5.4S // .*......................... + sqrdmulh v21.4S, v25.4S, v17.S[1] // ..*........................ + // gap // ........................... + // gap // ........................... + add v3.4S, v31.4S, v18.4S // *.......................... + mul v11.4S, v25.4S, v17.S[0] // ....*...................... + // gap // ........................... + // gap // ........................... + sqrdmulh v7.4S, v23.4S, v17.S[3] // .....*..................... + mul v8.4S, v23.4S, v17.S[2] // ......*.................... + // gap // ........................... + // gap // ........................... + add v23.4S, v22.4S, v5.4S // ...*....................... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v11.4S, v21.4S, v29.4S // .........*................. + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v8.4S, v7.4S, v29.4S // ........*.................. + add v10.4S, v3.4S, v23.4S // .......*................... + // gap // ........................... + // gap // ........................... + sub v12.4S, v3.4S, v23.4S // ...........*............... + // gap // ........................... + // gap // ........................... + // gap // ........................... + srshr v23.4S, v10.4S, #23 // ...............*........... + // gap // ........................... + // gap // ........................... + // gap // ........................... 
+ sub v18.4S, v11.4S, v8.4S // ..........*................ + mul v21.4S, v12.4S, v30.S[0] // ...................*....... + // gap // ........................... + // gap // ........................... + add v3.4S, v11.4S, v8.4S // ............*.............. + sqrdmulh v11.4S, v12.4S, v30.S[1] // ..............*............ + // gap // ........................... + // gap // ........................... + sqrdmulh v2.4S, v18.4S, v30.S[1] // ..................*........ + mul v18.4S, v18.4S, v30.S[0] // ................*.......... + // gap // ........................... + // gap // ........................... + mls v10.4S, v23.4S, v29.4S // ....................*...... + srshr v30.4S, v3.4S, #23 // .............*............. + // gap // ........................... + // gap // ........................... + mls v21.4S, v11.4S, v29.4S // .......................*... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v18.4S, v2.4S, v29.4S // .....................*..... + // gap // ........................... + // gap // ........................... + // gap // ........................... + mls v3.4S, v30.4S, v29.4S // .................*......... + str q10, [x0], #(16*4) // ......................*.... + // gap // ........................... + // gap // ........................... + str q21, [x0, #-32] // ..........................* + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q18, [x0, #-16] // .........................*. + // gap // ........................... + // gap // ........................... + // gap // ........................... + str q3, [x0, #-48] // ........................*.. + // gap // ........................... + // gap // ........................... + // gap // ........................... + + // original source code + // add v10.4S, v31.4S, v18.4S // ..*........................ 
+ // sub v0.4S, v22.4S, v5.4S // *.......................... + // sqrdmulh v2.4S, v25.4S, v17.S[1] // .*......................... + // add v5.4S, v22.4S, v5.4S // ......*.................... + // mul v8.4S, v25.4S, v17.S[0] // ...*....................... + // sqrdmulh v19.4S, v0.4S, v17.S[3] // ....*...................... + // mul v9.4S, v0.4S, v17.S[2] // .....*..................... + // add v0.4S, v10.4S, v5.4S // .........*................. + // mls v9.4S, v19.4S, v29.4S // ........*.................. + // mls v8.4S, v2.4S, v29.4S // .......*................... + // sub v2.4S, v8.4S, v9.4S // ............*.............. + // sub v15.4S, v10.4S, v5.4S // ..........*................ + // add v12.4S, v8.4S, v9.4S // ..............*............ + // srshr v9.4S, v12.4S, #23 // ...................*....... + // sqrdmulh v27.4S, v15.4S, v30.S[1] // ...............*........... + // srshr v18.4S, v0.4S, #23 // ...........*............... + // mul v13.4S, v2.4S, v30.S[0] // .................*......... + // mls v12.4S, v9.4S, v29.4S // ......................*.... + // sqrdmulh v2.4S, v2.4S, v30.S[1] // ................*.......... + // mul v19.4S, v15.4S, v30.S[0] // .............*............. + // mls v0.4S, v18.4S, v29.4S // ..................*........ + // mls v13.4S, v2.4S, v29.4S // .....................*..... + // str q0, [x0], #(16*4) // .......................*... + // mls v19.4S, v27.4S, v29.4S // ....................*...... + // str q12, [x0, #-48] // ..........................* + // str q13, [x0, #-16] // .........................*. + // str q19, [x0, #-32] // ........................*.. 
+ + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q9, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q10, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q11, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... 
+ ldr q15, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + ldr q19, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q16, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + sub v14.4S, v20.4S, v9.4S // ................*....................................................................................................................................................................................................................................................................... + add v13.4S, v20.4S, v9.4S // .................*...................................................................................................................................................................................................................................................................... + ldr q12, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ sub v22.4S, v10.4S, v11.4S // .....................*.................................................................................................................................................................................................................................................................. + ldr q28, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q24, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mul v27.4S, v14.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v22.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... 
+ mul v8.4S, v22.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + ldr q23, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + add v17.4S, v12.4S, v16.4S // ...........................*............................................................................................................................................................................................................................................................ + sub v21.4S, v12.4S, v16.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q16, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + mls v27.4S, v14.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ sub v20.4S, v19.4S, v28.4S // ...............................*........................................................................................................................................................................................................................................................ + ldr q18, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + mls v8.4S, v9.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + mul v9.4S, v21.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + ldr q22, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + mul v12.4S, v20.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sqrdmulh v14.4S, v21.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + sqrdmulh v21.4S, v20.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + add v20.4S, v15.4S, v24.4S // ...............................................*........................................................................................................................................................................................................................................ + add v10.4S, v10.4S, v11.4S // ......................*................................................................................................................................................................................................................................................................. + sub v24.4S, v15.4S, v24.4S // ..............................................*......................................................................................................................................................................................................................................... + add v15.4S, v27.4S, v8.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ mls v12.4S, v21.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + mls v9.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + mul v21.4S, v24.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v24.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + sub v24.4S, v13.4S, v10.4S // ........................................................*............................................................................................................................................................................................................................... + sub v11.4S, v27.4S, v8.4S // .............................................................*.......................................................................................................................................................................................................................... 
+ add v27.4S, v9.4S, v12.4S // ........................................................................*............................................................................................................................................................................................................... + sub v12.4S, v9.4S, v12.4S // .......................................................................*................................................................................................................................................................................................................ + mls v21.4S, v14.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v14.4S, v23.4S, v16.4S // ....................................*................................................................................................................................................................................................................................................... + add v8.4S, v13.4S, v10.4S // .........................................................*.............................................................................................................................................................................................................................. + sub v13.4S, v15.4S, v27.4S // .....................................................................................................*.................................................................................................................................................................................. 
+ add v9.4S, v15.4S, v27.4S // ......................................................................................................*................................................................................................................................................................................. + mul v27.4S, v14.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + mul v15.4S, v24.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + sqrdmulh v10.4S, v14.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + sqrdmulh v14.4S, v24.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + add v24.4S, v22.4S, v18.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ add v28.4S, v19.4S, v28.4S // ................................*....................................................................................................................................................................................................................................................... + sqrdmulh v19.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mls v27.4S, v10.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v22.4S, v22.4S, v18.4S // .........................................*.............................................................................................................................................................................................................................................. + mls v15.4S, v14.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sub v14.4S, v17.4S, v28.4S // ..................................................................*..................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v22.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + mul v10.4S, v22.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v22.4S, v14.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v14.4S, v14.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + add v23.4S, v23.4S, v16.4S // .....................................*.................................................................................................................................................................................................................................................. + add v28.4S, v17.4S, v28.4S // ...................................................................*.................................................................................................................................................................................................................... 
+ mls v10.4S, v18.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v12.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mls v14.4S, v22.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + mul v22.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mul v11.4S, v12.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + add v16.4S, v23.4S, v24.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ sub v12.4S, v27.4S, v10.4S // .................................................................................*...................................................................................................................................................................................................... + add v17.4S, v27.4S, v10.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v22.4S, v19.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + add v10.4S, v15.4S, v14.4S // ...........................................................................................................*............................................................................................................................................................................ + mul v19.4S, v12.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v27.4S, v12.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ sub v12.4S, v8.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... + mls v11.4S, v18.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + add v8.4S, v8.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... + sub v28.4S, v15.4S, v14.4S // ..........................................................................................................*............................................................................................................................................................................. + ldr q18, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + ldr q14, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... 
+ mls v19.4S, v27.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + sub v15.4S, v23.4S, v24.4S // ............................................................................*........................................................................................................................................................................................................... + sqrdmulh v27.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + sub v23.4S, v14.4S, v18.4S // ...................................................*.................................................................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + add v24.4S, v14.4S, v18.4S // ....................................................*................................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v23.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + mul v23.4S, v23.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v13.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + mls v12.4S, v27.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + add v27.4S, v20.4S, v24.4S // .......................................................................................*................................................................................................................................................................................................ 
+ sub v24.4S, v20.4S, v24.4S // ......................................................................................*................................................................................................................................................................................................. + mls v23.4S, v18.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + mls v13.4S, v14.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + mul v18.4S, v15.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sqrdmulh v15.4S, v15.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + sub v14.4S, v16.4S, v27.4S // ....................................................................................................................*................................................................................................................................................................... 
+ sub v20.4S, v21.4S, v23.4S // ...........................................................................................*............................................................................................................................................................................................ + add v16.4S, v16.4S, v27.4S // .....................................................................................................................*.................................................................................................................................................................. + add v23.4S, v21.4S, v23.4S // ............................................................................................*........................................................................................................................................................................................... + mul v27.4S, v24.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mul v21.4S, v20.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... 
+ mls v18.4S, v15.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + add v15.4S, v17.4S, v23.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v17.4S, v17.4S, v23.4S // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v23.4S, v24.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mls v21.4S, v20.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sub v24.4S, v22.4S, v11.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ mls v27.4S, v23.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + add v20.4S, v19.4S, v21.4S // ....................................................................................................................................*................................................................................................................................................... + sub v23.4S, v9.4S, v15.4S // .............................................................................................................................................*.......................................................................................................................................... + add v9.4S, v9.4S, v15.4S // ..............................................................................................................................................*......................................................................................................................................... + mul v15.4S, v14.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + sub v21.4S, v19.4S, v21.4S // ...................................................................................................................................*.................................................................................................................................................... 
+ sqrdmulh v19.4S, v14.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + sub v14.4S, v18.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... + add v18.4S, v18.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ + add v22.4S, v22.4S, v11.4S // ................................................................................................................*....................................................................................................................................................................... + mul v27.4S, v14.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... 
+ mls v15.4S, v19.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v11.4S, v22.4S, v20.4S // ........................................................................................................................................................*............................................................................................................................... + sub v19.4S, v22.4S, v20.4S // .......................................................................................................................................................*................................................................................................................................ + mul v20.4S, v17.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sqrdmulh v17.4S, v17.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v27.4S, v14.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ sub v22.4S, v12.4S, v15.4S // ............................................................................................................................................................*........................................................................................................................... + sqrdmulh v14.4S, v23.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + add v12.4S, v12.4S, v15.4S // .............................................................................................................................................................*.......................................................................................................................... + sub v15.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v20.4S, v17.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + mul v17.4S, v23.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ 
+ add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + sqrdmulh v23.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v18.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mul v15.4S, v24.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + sub count, count, #1 +layer1234_start: + sqrdmulh v24.4S, v24.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + mls v18.4S, v23.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
+ sqrdmulh v23.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mul v22.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + mls v17.4S, v14.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mul v14.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v28.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + mls v15.4S, v24.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ mls v22.4S, v23.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v23.4S, v21.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v24.4S, v21.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + sub v21.4S, v8.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v14.4S, v28.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ cmge v28.4S, v31.4S, v22.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v16.4S, v22.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v23.4S, v24.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + sqrdmulh v24.4S, v21.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + sub v28.4S, v28.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... + mul v16.4S, v21.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. 
+ add v21.4S, v14.4S, v27.4S // .......................................................................................................................................................................*................................................................................................................ + mls v22.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v28.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + sub v27.4S, v14.4S, v27.4S // ......................................................................................................................................................................*................................................................................................................. + cmge v14.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v16.4S, v24.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
+ cmge v24.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + str q22, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v22.4S, v14.4S, v28.4S // ......................................................................................................................................................................................*................................................................................................. + cmge v14.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + add v28.4S, v15.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... + sub v15.4S, v15.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ mls v17.4S, v22.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sub v24.4S, v24.4S, v14.4S // ..........................................................................................................................................................................................*............................................................................................. + sub v14.4S, v13.4S, v20.4S // .................................................................................................................................................................*...................................................................................................................... + add v23.4S, v13.4S, v20.4S // ..................................................................................................................................................................*..................................................................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v22.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... 
+ str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v13.4S, v14.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mul v17.4S, v23.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v23.4S, v23.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ sqrdmulh v14.4S, v14.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v24.4S, v27.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v27.4S, v27.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v17.4S, v23.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
+ mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v13.4S, v14.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + mls v27.4S, v24.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + sub v12.4S, v22.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v22.4S, v21.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v18.4S, v21.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ sqrdmulh v23.4S, v15.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mul v21.4S, v15.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + cmge v24.4S, v27.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v14.4S, v31.4S, v27.4S // ........................................................................................................................................................................................................*............................................................................... + mls v16.4S, v12.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v12.4S, v28.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... 
+ sub v15.4S, v14.4S, v24.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v21.4S, v23.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + mul v24.4S, v19.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v28.4S, v28.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sqrdmulh v23.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... 
+ sqrdmulh v16.4S, v19.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + sqrdmulh v19.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v27.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v12.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mul v15.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + mls v24.4S, v16.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
+ mul v9.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v28.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................................*................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v11.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + mls v15.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v9.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ cmge v14.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + sub v19.4S, v28.4S, v11.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v28.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v23.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................*........................... + sub v14.4S, v28.4S, v14.4S // ......................................................................................................................................................................................................*................................................................................. + str q27, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
+ cmge v27.4S, v21.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v11.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + cmge v28.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................................................................................*........... + mls v13.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + mls v17.4S, v19.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ cmge v14.4S, v9.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v19.4S, v28.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + str q13, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mls v22.4S, v18.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ str q17, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + cmge v28.4S, v31.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v17.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v8.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + cmge v13.4S, v31.4S, v21.4S // ............................................................................................................................................................................................................*........................................................................... 
+ sub v17.4S, v23.4S, v17.4S // ..............................................................................................................................................................................................................................................................*......................... + sub v8.4S, v11.4S, v8.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v11.4S, v31.4S, v9.4S // ....................................................................................................................................................................................................................................................*................................... + mls v15.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v23.4S, v24.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mls v16.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ sub v8.4S, v11.4S, v14.4S // ......................................................................................................................................................................................................................................................*................................. + sub v17.4S, v13.4S, v27.4S // ..............................................................................................................................................................................................................*......................................................................... + cmge v27.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + ldr q14, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v16.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ ldr q11, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + cmge v13.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + str q15, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + cmge v15.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + sub v27.4S, v27.4S, v16.4S // ..................................................................................................................................................................................................................................................................*..................... + sqrdmulh v16.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ sub v10.4S, v28.4S, v23.4S // ..............................................................................................................................................................................................*......................................................................................... + ldr q28, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + mls v12.4S, v19.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + ldr q19, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v24.4S, v10.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
+ add v10.4S, v11.4S, v14.4S // .................e...................................................................................................................................................................................................................................................................... + mls v18.4S, v16.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sub v11.4S, v11.4S, v14.4S // ................e....................................................................................................................................................................................................................................................................... + ldr q14, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v21.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ ldr q17, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + sub v13.4S, v13.4S, v15.4S // ..........................................................................................................................................................................................................................................................................*............. + sqrdmulh v23.4S, v11.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + str q24, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... + ldr q24, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + mul v16.4S, v11.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
+ sub v27.4S, v17.4S, v14.4S // ..........................e............................................................................................................................................................................................................................................................. + add v15.4S, v24.4S, v19.4S // ................................e....................................................................................................................................................................................................................................................... + ldr q11, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + ldr q20, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + mls v22.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ 
+ str q12, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + add v12.4S, v17.4S, v14.4S // ...........................e............................................................................................................................................................................................................................................................ + mul v13.4S, v27.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v27.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + add v17.4S, v12.4S, v15.4S // ...................................................................e.................................................................................................................................................................................................................... + mls v9.4S, v8.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ str q21, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ + add v27.4S, v11.4S, v20.4S // ......................e................................................................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v19.4S // ...............................e........................................................................................................................................................................................................................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + ldr q19, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + mls v16.4S, v23.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... 
+ sub v8.4S, v11.4S, v20.4S // .....................e.................................................................................................................................................................................................................................................................. + mls v13.4S, v14.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + sub v14.4S, v12.4S, v15.4S // ..................................................................e..................................................................................................................................................................................................................... + ldr q15, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v8.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v21.4S, v8.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ 
+ ldr q11, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + add v23.4S, v10.4S, v27.4S // .........................................................e.............................................................................................................................................................................................................................. + sqrdmulh v12.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + sub v10.4S, v10.4S, v27.4S // ........................................................e............................................................................................................................................................................................................................... + cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + ldr q8, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. 
+ mul v24.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + str q9, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + sub v9.4S, v28.4S, v19.4S // .........................................e.............................................................................................................................................................................................................................................. + mls v21.4S, v22.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + sub v20.4S, v27.4S, v20.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ add v27.4S, v28.4S, v19.4S // ..........................................e............................................................................................................................................................................................................................................. + mul v19.4S, v9.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + mls v24.4S, v12.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mls v18.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sub v12.4S, v16.4S, v21.4S // .............................................................e.......................................................................................................................................................................................................................... + add v21.4S, v16.4S, v21.4S // ..............................................................e......................................................................................................................................................................................................................... 
+ add v16.4S, v15.4S, v8.4S // .....................................e.................................................................................................................................................................................................................................................. + sub v22.4S, v13.4S, v24.4S // .......................................................................e................................................................................................................................................................................................................ + add v20.4S, v13.4S, v24.4S // ........................................................................e............................................................................................................................................................................................................... + str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + add v18.4S, v16.4S, v27.4S // .............................................................................e.......................................................................................................................................................................................................... + sqrdmulh v28.4S, v9.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... 
+ sub v13.4S, v21.4S, v20.4S // .....................................................................................................e.................................................................................................................................................................................. + add v9.4S, v21.4S, v20.4S // ......................................................................................................e................................................................................................................................................................................. + mul v24.4S, v12.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + mul v21.4S, v22.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + mls v19.4S, v28.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + sqrdmulh v20.4S, v12.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... 
+ ldr q12, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v13.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sub v27.4S, v16.4S, v27.4S // ............................................................................e........................................................................................................................................................................................................... + mls v13.4S, v28.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + sqrdmulh v16.4S, v14.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. 
+ mul v28.4S, v14.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + mul v14.4S, v10.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + sqrdmulh v10.4S, v10.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v24.4S, v20.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + sub v15.4S, v15.4S, v8.4S // ....................................e................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v22.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. 
+ mul v22.4S, v27.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + mls v28.4S, v16.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + sqrdmulh v20.4S, v27.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + mls v14.4S, v10.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + mul v27.4S, v15.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + add v16.4S, v11.4S, v12.4S // ...............................................e........................................................................................................................................................................................................................................ 
+ mls v21.4S, v8.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + sqrdmulh v8.4S, v15.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + sub v15.4S, v11.4S, v12.4S // ..............................................e......................................................................................................................................................................................................................................... + mls v22.4S, v20.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v10.4S, v14.4S, v28.4S // ...........................................................................................................e............................................................................................................................................................................ + add v11.4S, v24.4S, v21.4S // ................................................................................................................e....................................................................................................................................................................... 
+ sub v28.4S, v14.4S, v28.4S // ..........................................................................................................e............................................................................................................................................................................. + mls v27.4S, v8.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + ldr q14, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + sub v24.4S, v24.4S, v21.4S // ...............................................................................................................e........................................................................................................................................................................ + ldr q20, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + mul v21.4S, v15.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... 
+ sqrdmulh v15.4S, v15.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mls v21.4S, v15.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sub v8.4S, v27.4S, v19.4S // .................................................................................e...................................................................................................................................................................................................... + add v19.4S, v27.4S, v19.4S // ..................................................................................e..................................................................................................................................................................................................... + add v12.4S, v14.4S, v20.4S // ....................................................e................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v8.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mul v15.4S, v8.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + sub v20.4S, v14.4S, v20.4S // ...................................................e.................................................................................................................................................................................................................................... + sub v14.4S, v23.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... + add v8.4S, v23.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... + sub v17.4S, v16.4S, v12.4S // ......................................................................................e................................................................................................................................................................................................. + add v12.4S, v16.4S, v12.4S // .......................................................................................e................................................................................................................................................................................................ 
+ mls v15.4S, v27.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + mul v23.4S, v17.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mul v27.4S, v20.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + add v16.4S, v18.4S, v12.4S // .....................................................................................................................e.................................................................................................................................................................. + sub v12.4S, v18.4S, v12.4S // ....................................................................................................................e................................................................................................................................................................... 
+ sqrdmulh v18.4S, v20.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mls v23.4S, v17.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sqrdmulh v17.4S, v14.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v20.4S, v14.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sqrdmulh v14.4S, v12.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mls v27.4S, v18.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ 
+ mul v12.4S, v12.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + add v18.4S, v22.4S, v23.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v22.4S, v22.4S, v23.4S // ..............................................................................................................................e......................................................................................................................................................... + mls v20.4S, v17.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + sub v23.4S, v10.4S, v18.4S // ..................................................................................................................................................e..................................................................................................................................... + add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................e.................................................................................................................................... 
+ sub v17.4S, v21.4S, v27.4S // ...........................................................................................e............................................................................................................................................................................................ + add v27.4S, v21.4S, v27.4S // ............................................................................................e........................................................................................................................................................................................... + sqrdmulh v21.4S, v22.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mls v12.4S, v14.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + sqrdmulh v14.4S, v17.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mul v18.4S, v17.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... 
+ add v17.4S, v19.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v19.4S, v19.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v27.4S, v22.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sub v22.4S, v20.4S, v12.4S // ............................................................................................................................................................e........................................................................................................................... + add v12.4S, v20.4S, v12.4S // .............................................................................................................................................................e.......................................................................................................................... + mls v18.4S, v14.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ 
+ sub v14.4S, v9.4S, v17.4S // .............................................................................................................................................e.......................................................................................................................................... + add v9.4S, v9.4S, v17.4S // ..............................................................................................................................................e......................................................................................................................................... + sqrdmulh v17.4S, v19.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mul v20.4S, v19.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + mls v27.4S, v21.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + sub v21.4S, v15.4S, v18.4S // ...................................................................................................................................e.................................................................................................................................................... 
+ add v15.4S, v15.4S, v18.4S // ....................................................................................................................................e................................................................................................................................................... + mul v18.4S, v23.4S, v0.S[0] // ....................................................................................................................................................e................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v0.S[1] // .....................................................................................................................................................e.................................................................................................................................. + sub v19.4S, v11.4S, v15.4S // .......................................................................................................................................................e................................................................................................................................ + add v11.4S, v11.4S, v15.4S // ........................................................................................................................................................e............................................................................................................................... + mls v20.4S, v17.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... 
+ mul v15.4S, v24.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mul v17.4S, v14.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v14.4S, v14.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // ...e...................................................................................................................................................................|...................................................................................................................e.................................................................... + // ldr q9, [x1, #(1*(512/8))] // e......................................................................................................................................................................|................................................................................................................e....................................................................... 
+ // ldr q10, [x1, #(2*(512/8))] // .............................e.........................................................................................................................................|.............................................................................................................................................e.......................................... + // ldr q11, [x1, #(3*(512/8))] // ..............................e........................................................................................................................................|..............................................................................................................................................e......................................... + // ldr q12, [x1, #(4*(512/8))] // .....................e.................................................................................................................................................|.....................................................................................................................................e.................................................. + // ldr q13, [x1, #(5*(512/8))] // ..................e....................................................................................................................................................|..................................................................................................................................e..................................................... + // ldr q14, [x1, #(6*(512/8))] // .........................e.............................................................................................................................................|.........................................................................................................................................e.............................................. 
+ // ldr q15, [x1, #(7*(512/8))] // ............e..........................................................................................................................................................|............................................................................................................................e........................................................... + // ldr q16, [x1, #(8*(512/8))] // ...............................................e.......................................................................................................................|...............................................................................................................................................................e........................ + // ldr q17, [x1, #(9*(512/8))] // .......................................................e...............................................................................................................|.......................................................................................................................................................................e................ + // ldr q18, [x1, #(10*(512/8))] // ..........e............................................................................................................................................................|..........................................................................................................................e............................................................. + // ldr q19, [x1, #(11*(512/8))] // ..........................................e............................................................................................................................|..........................................................................................................................................................e............................. 
+ // ldr q20, [x1, #(12*(512/8))] // ..................................................e....................................................................................................................|..................................................................................................................................................................e..................... + // ldr q21, [x1, #(13*(512/8))] // ................................................................................e......................................................................................|........................................................................................................................................................................................ + // ldr q22, [x1, #(14*(512/8))] // ..........................................................................................................e............................................................|........................................................................................................................................................................................ + // ldr q23, [x1, #(15*(512/8))] // ............................................................................................................e..........................................................|........................................................................................................................................................................................ + // sub v24.4s, v8.4s, v9.4s // .................e.....................................................................................................................................................|.................................................................................................................................e...................................................... 
+ // add v8.4s, v8.4s, v9.4s // ...............e.......................................................................................................................................................|...............................................................................................................................e........................................................ + // mul v9.4s, v24.4s, v3.s[2] // ..........................e............................................................................................................................................|..........................................................................................................................................e............................................. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // .......................e...............................................................................................................................................|.......................................................................................................................................e................................................ + // mls v9.4s, v24.4s, v29.4s // ...........................................e...........................................................................................................................|...........................................................................................................................................................e............................ + // sub v24.4s, v10.4s, v11.4s // ............................................e..........................................................................................................................|............................................................................................................................................................e........................... 
+ // add v10.4s, v10.4s, v11.4s // .......................................e...............................................................................................................................|.......................................................................................................................................................e................................ + // mul v11.4s, v24.4s, v4.s[0] // .................................................e.....................................................................................................................|.................................................................................................................................................................e...................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ................................................e......................................................................................................................|................................................................................................................................................................e....................... + // mls v11.4s, v24.4s, v29.4s // ............................................................e..........................................................................................................|............................................................................................................................................................................e........... + // sub v24.4s, v12.4s, v13.4s // ...........................e...........................................................................................................................................|...........................................................................................................................................e............................................ 
+ // add v12.4s, v12.4s, v13.4s // .................................e.....................................................................................................................................|.................................................................................................................................................e...................................... + // mul v13.4s, v24.4s, v4.s[2] // ..................................e....................................................................................................................................|..................................................................................................................................................e..................................... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // ...................................e...................................................................................................................................|...................................................................................................................................................e.................................... + // mls v13.4s, v24.4s, v29.4s // .............................................e.........................................................................................................................|.............................................................................................................................................................e.......................... + // sub v24.4s, v14.4s, v15.4s // ........................................e..............................................................................................................................|........................................................................................................................................................e............................... 
+ // add v14.4s, v14.4s, v15.4s // ............................e..........................................................................................................................................|............................................................................................................................................e........................................... + // mul v15.4s, v24.4s, v5.s[0] // ........................................................e..............................................................................................................|........................................................................................................................................................................e............... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ....................................................e..................................................................................................................|....................................................................................................................................................................e................... + // mls v15.4s, v24.4s, v29.4s // ................................................................e......................................................................................................|................................................................................................................................................................................e....... + // sub v24.4s, v16.4s, v17.4s // ..........................................................................................e............................................................................|........................................................................................................................................................................................ 
+ // add v16.4s, v16.4s, v17.4s // ....................................................................e..................................................................................................|....................................................................................................................................................................................e... + // mul v17.4s, v24.4s, v5.s[2] // ................................................................................................e......................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...................................................................................................e...................................................................|........................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // .........................................................................................................e.............................................................|........................................................................................................................................................................................ + // sub v24.4s, v18.4s, v19.4s // ...........................................................e...........................................................................................................|...........................................................................................................................................................................e............ 
+ // add v18.4s, v18.4s, v19.4s // ..............................................................e........................................................................................................|..............................................................................................................................................................................e......... + // mul v19.4s, v24.4s, v6.s[0] // ...............................................................e.......................................................................................................|...............................................................................................................................................................................e........ + // sqrdmulh v24.4s, v24.4s, v6.s[1] // .........................................................................e.............................................................................................|........................................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ..............................................................................e........................................................................................|........................................................................................................................................................................................ + // sub v24.4s, v20.4s, v21.4s // ....................................................................................................e..................................................................|........................................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v21.4s // .................................................................................................e.....................................................................|........................................................................................................................................................................................ + // mul v21.4s, v24.4s, v6.s[2] // .............................................................................................................e.........................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ..............................................................................................................e........................................................|........................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...............................................................................................................e.......................................................|........................................................................................................................................................................................ + // sub v24.4s, v22.4s, v23.4s // .....................................................................................................................e.................................................|........................................................................................................................................................................................ 
+ // add v22.4s, v22.4s, v23.4s // ..................................................................................................................e....................................................|........................................................................................................................................................................................ + // mul v23.4s, v24.4s, v7.s[0] // .............................................................................................................................e.........................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................................................e......................................|........................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // .....................................................................................................................................e.................................|........................................................................................................................................................................................ + // sub v24.4s, v8.4s, v10.4s // .....................................................e.................................................................................................................|.....................................................................................................................................................................e.................. 
+ // add v8.4s, v8.4s, v10.4s // ...................................................e...................................................................................................................|...................................................................................................................................................................e.................... + // mul v10.4s, v24.4s, v1.s[2] // .......................................................................................e...............................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................................e..............................................................................|........................................................................................................................................................................................ + // mls v10.4s, v24.4s, v29.4s // ...............................................................................................e.......................................................................|........................................................................................................................................................................................ + // sub v24.4s, v9.4s, v11.4s // ..................................................................e....................................................................................................|..................................................................................................................................................................................e..... 
+ // add v9.4s, v9.4s, v11.4s // ...................................................................e...................................................................................................|...................................................................................................................................................................................e.... + // mul v11.4s, v24.4s, v1.s[2] // ............................................................................e..........................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............................................................................e.......................................................................................|........................................................................................................................................................................................ + // mls v11.4s, v24.4s, v29.4s // .........................................................................................e.............................................................................|........................................................................................................................................................................................ + // sub v24.4s, v12.4s, v14.4s // ..............................................e........................................................................................................................|..............................................................................................................................................................e......................... 
+ // add v12.4s, v12.4s, v14.4s // ....................................e..................................................................................................................................|....................................................................................................................................................e................................... + // mul v14.4s, v24.4s, v2.s[0] // ......................................................................................e................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................................................e.................................................................................|........................................................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // .............................................................................................e.........................................................................|........................................................................................................................................................................................ + // sub v24.4s, v13.4s, v15.4s // .....................................................................e.................................................................................................|.....................................................................................................................................................................................e.. 
+ // add v13.4s, v13.4s, v15.4s // ......................................................................e................................................................................................|......................................................................................................................................................................................e. + // mul v15.4s, v24.4s, v2.s[0] // .............................................................................e.........................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................e...........................................................................|........................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // ..................................................................................................e....................................................................|........................................................................................................................................................................................ + // sub v24.4s, v16.4s, v18.4s // ...................................................................................e...................................................................................|........................................................................................................................................................................................ 
+ // add v16.4s, v16.4s, v18.4s // ........................................................................e..............................................................................................|........................................................................................................................................................................................ + // mul v18.4s, v24.4s, v2.s[2] // ............................................................................................e..........................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................e........................................................................|........................................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // .....................................................................................................e.................................................................|........................................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // ................................................................................................................e......................................................|........................................................................................................................................................................................ 
+ // add v17.4s, v17.4s, v19.4s // .................................................................................................................e.....................................................|........................................................................................................................................................................................ + // mul v19.4s, v24.4s, v2.s[2] // ....................................................................................................................e..................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................e...................................................|........................................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ..........................................................................................................................e............................................|........................................................................................................................................................................................ + // sub v24.4s, v20.4s, v22.4s // ........................................................................................................................e..............................................|........................................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v22.4s // .........................................................................................................................e.............................................|........................................................................................................................................................................................ + // mul v22.4s, v24.4s, v3.s[0] // ...........................................................................................................................e...........................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ............................................................................................................................e..........................................|........................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // .................................................................................................................................e.....................................|........................................................................................................................................................................................ + // sub v24.4s, v21.4s, v23.4s // ............................................................................................................................................e..........................|........................................................................................................................................................................................ 
+ // add v21.4s, v21.4s, v23.4s // .............................................................................................................................................e.........................|........................................................................................................................................................................................ + // mul v23.4s, v24.4s, v3.s[0] // .................................................................................................................................................e.....................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................................e......................|........................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................e...............|........................................................................................................................................................................................ + // sub v24.4s, v8.4s, v12.4s // ......................................................................................................................e................................................|........................................................................................................................................................................................ 
+ // add v8.4s, v8.4s, v12.4s // .......................................................................................................................e...............................................|........................................................................................................................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ...................................................................................................................................e...................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................................................................................................................e....................................|........................................................................................................................................................................................ + // mls v12.4s, v24.4s, v29.4s // .........................................................................................................................................e.............................|........................................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // ..........................................................................e............................................................................................|........................................................................................................................................................................................ 
+ // add v9.4s, v9.4s, v13.4s // ...........................................................................e...........................................................................................|........................................................................................................................................................................................ + // mul v13.4s, v24.4s, v0.s[2] // ..................................................................................e....................................................................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .................................................................................e.....................................................................................|........................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // ....................................................................................e..................................................................................|........................................................................................................................................................................................ + // sub v24.4s, v10.4s, v14.4s // ........................................................................................................e..............................................................|........................................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v14.4s // ......................................................................................................e................................................................|........................................................................................................................................................................................ + // mul v14.4s, v24.4s, v0.s[2] // .......................................................................................................................................................................|....*................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................|.....*.................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // .......................................................................................................................................................................|............*........................................................................................................................................................................... + // sub v24.4s, v11.4s, v15.4s // ...........................................................................................................e...........................................................|........................................................................................................................................................................................ 
+ // add v11.4s, v11.4s, v15.4s // .......................................................................................................e...............................................................|........................................................................................................................................................................................ + // mul v15.4s, v24.4s, v0.s[2] // ....................................................................................................................................................................e..|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................................*........................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // .......................................................................................................................................................................|......*................................................................................................................................................................................. + // sub v24.4s, v16.4s, v20.4s // ...............................................................................................................................e.......................................|........................................................................................................................................................................................ 
+ // add v16.4s, v16.4s, v20.4s // ..............................................................................................................................e........................................|........................................................................................................................................................................................ + // mul v20.4s, v24.4s, v1.s[0] // ......................................................................................................................................e................................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e..................................|........................................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // ...............................................................................................................................................e.......................|........................................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // ...................................................................................................................................................e...................|........................................................................................................................................................................................ 
+ // add v17.4s, v17.4s, v21.4s // ..................................................................................................................................................e....................|........................................................................................................................................................................................ + // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................e...........|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................................................e............|........................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...................................................................................................................................................................e...|........................................................................................................................................................................................ + // sub v24.4s, v18.4s, v22.4s // ........................................................................................................................................e..............................|........................................................................................................................................................................................ 
+ // add v18.4s, v18.4s, v22.4s // .......................................................................................................................................e...............................|........................................................................................................................................................................................ + // mul v22.4s, v24.4s, v1.s[0] // ....................................................................................................................................................e..................|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................e........................|........................................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................e..........|........................................................................................................................................................................................ + // sub v24.4s, v19.4s, v23.4s // .............................................................................................................................................................e.........|........................................................................................................................................................................................ 
+ // add v19.4s, v19.4s, v23.4s // ..............................................................................................................................................................e........|........................................................................................................................................................................................ + // mul v23.4s, v24.4s, v1.s[0] // .......................................................................................................................................................................|........*............................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................................................................................................|.........*.............................................................................................................................................................................. + // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...............*........................................................................................................................................................................ + // sub v24.4s, v8.4s, v16.4s // .......................................................................................................................................................................|..........*............................................................................................................................................................................. 
+ // add v8.4s, v8.4s, v16.4s // .......................................................................................................................................................................|...........*............................................................................................................................................................................ + // mul v16.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|..................*..................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|................*....................................................................................................................................................................... + // mls v16.4s, v24.4s, v29.4s // .......................................................................................................................................................................|........................*............................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // ........................................................................................................................................................e..............|........................................................................................................................................................................................ 
+ // add v9.4s, v9.4s, v17.4s // .........................................................................................................................................................e.............|........................................................................................................................................................................................ + // mul v17.4s, v24.4s, v0.s[0] // .....................................................................................................................................................................e.|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................................................e|........................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...*.................................................................................................................................................................................... + // sub v24.4s, v10.4s, v18.4s // ..........................................................................................................................................e............................|........................................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v18.4s // ...........................................................................................................................................e...........................|........................................................................................................................................................................................ + // mul v18.4s, v24.4s, v0.s[0] // ...............................................................................................................................................................e.......|........................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................................................................................e......|........................................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // .......................................................................................................................................................................|*....................................................................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // .................................................................................................................................................................e.....|........................................................................................................................................................................................ 
+ // add v11.4s, v11.4s, v19.4s // ..................................................................................................................................................................e....|........................................................................................................................................................................................ + // mul v19.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|...............................................................*........................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|...................................................................*.................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .......................................................................................................................................................................|........................................................................*............................................................................................................... + // sub v24.4s, v12.4s, v20.4s // .....................................................................................................................................................e.................|........................................................................................................................................................................................ 
+ // add v12.4s, v12.4s, v20.4s // ......................................................................................................................................................e................|........................................................................................................................................................................................ + // mul v20.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|..*..................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|.*...................................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // .......................................................................................................................................................................|.......*................................................................................................................................................................................ + // sub v24.4s, v13.4s, v21.4s // .......................................................................................................................................................................|.................................*...................................................................................................................................................... 
+ // add v13.4s, v13.4s, v21.4s // .......................................................................................................................................................................|..................................*..................................................................................................................................................... + // mul v21.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|......................................*................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|...........................................*............................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // .......................................................................................................................................................................|..................................................*..................................................................................................................................... + // sub v24.4s, v14.4s, v22.4s // .......................................................................................................................................................................|......................*................................................................................................................................................................. 
+ // add v14.4s, v14.4s, v22.4s // .......................................................................................................................................................................|...................*.................................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|.............................................*.......................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|............................................*........................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // .......................................................................................................................................................................|...................................................*.................................................................................................................................... + // sub v24.4s, v15.4s, v23.4s // .......................................................................................................................................................................|..............................*......................................................................................................................................................... 
+ // add v15.4s, v15.4s, v23.4s // .......................................................................................................................................................................|.............................*.......................................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // .......................................................................................................................................................................|........................................................*............................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................................|.......................................................*................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // .......................................................................................................................................................................|..............................................................*......................................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // .......................................................................................................................................................................|....................................*................................................................................................................................................... 
+ // cmge v28.4s, v16.4s, v30.4s // .......................................................................................................................................................................|...............................................*........................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|....................................................*................................................................................................................................... + // mls v16.4s, v28.4s, v29.4s // .......................................................................................................................................................................|...........................................................*............................................................................................................................ + // cmge v27.4s, v31.4s, v17.4s // .......................................................................................................................................................................|.......................*................................................................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // .......................................................................................................................................................................|.....................*.................................................................................................................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|...........................*............................................................................................................................................................ + // mls v17.4s, v28.4s, v29.4s // .......................................................................................................................................................................|...............................*........................................................................................................................................................ + // cmge v27.4s, v31.4s, v18.4s // .......................................................................................................................................................................|.........................*.............................................................................................................................................................. + // cmge v28.4s, v18.4s, v30.4s // .......................................................................................................................................................................|............................*........................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|................................*....................................................................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................*............................................................................................................................................. + // cmge v27.4s, v31.4s, v19.4s // .......................................................................................................................................................................|..................................................................................................*..................................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......................................................................................................................................................................|...........................................................................................................*............................................................................ + // sub v28.4s, v27.4s, v28.4s // .........*.............................................................................................................................................................|.........................................................................................................................*.............................................................. + // mls v19.4s, v28.4s, v29.4s // ..............*........................................................................................................................................................|..............................................................................................................................*......................................................... 
+ // cmge v27.4s, v31.4s, v20.4s // .......................................................................................................................................................................|.............*.......................................................................................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // .......................................................................................................................................................................|..............*......................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.................*...................................................................................................................................................................... + // mls v20.4s, v28.4s, v29.4s // .......................................................................................................................................................................|....................*................................................................................................................................................................... + // cmge v27.4s, v31.4s, v21.4s // .......................................................................................................................................................................|.................................................................................*...................................................................................................... 
+ // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................................................................................|...............................................................................*........................................................................................................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|...................................................................................*.................................................................................................... + // mls v21.4s, v28.4s, v29.4s // .......................................................................................................................................................................|.........................................................................................*.............................................................................................. + // cmge v27.4s, v31.4s, v22.4s // .......................................................................................................................................................................|..........................................................*............................................................................................................................. + // cmge v28.4s, v22.4s, v30.4s // .......................................................................................................................................................................|.........................................................*.............................................................................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.............................................................*.......................................................................................................................... + // mls v22.4s, v28.4s, v29.4s // .......................................................................................................................................................................|.....................................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // .......................................................................................................................................................................|......................................................................................................*................................................................................. + // cmge v28.4s, v23.4s, v30.4s // .......................................................................................................................................................................|.....................................................................................*.................................................................................................. + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|..............................................................................................................*......................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ....................*..................................................................................................................................................|....................................................................................................................................*................................................... + // str q16, [x1, #(8*(512/8))] // .......................................................................................................................................................................|.................................................................*...................................................................................................................... + // str q17, [x1, #(9*(512/8))] // .......................................................................................................................................................................|.....................................*.................................................................................................................................................. + // str q18, [x1, #(10*(512/8))] // .......................................................................................................................................................................|................................................*....................................................................................................................................... + // str q19, [x1, #(11*(512/8))] // ........................*..............................................................................................................................................|........................................................................................................................................*............................................... 
+ // str q20, [x1, #(12*(512/8))] // .......................................................................................................................................................................|..........................*............................................................................................................................................................. + // str q21, [x1, #(13*(512/8))] // .......................................................................................................................................................................|..............................................................................................*......................................................................................... + // str q22, [x1, #(14*(512/8))] // .......................................................................................................................................................................|....................................................................................*................................................................................................... + // str q23, [x1, #(15*(512/8))] // ......................................*................................................................................................................................|......................................................................................................................................................*................................. + // mul v16.4s, v8.4s, v25.4s // .......................................................................................................................................................................|...........................................................................*............................................................................................................ 
+ // sqrdmulh v8.4s, v8.4s, v26.4s // .......................................................................................................................................................................|.......................................................................................*................................................................................................ + // mls v16.4s, v8.4s, v29.4s // .......................................................................................................................................................................|.............................................................................................*.......................................................................................... + // mul v17.4s, v9.4s, v25.4s // .......................................................................................................................................................................|.........................................................................*.............................................................................................................. + // sqrdmulh v9.4s, v9.4s, v26.4s // .......................................................................................................................................................................|..................................................................*..................................................................................................................... + // mls v17.4s, v9.4s, v29.4s // .......................................................................................................................................................................|..............................................................................*......................................................................................................... 
+ // mul v18.4s, v10.4s, v25.4s // .......................................................................................................................................................................|................................................................................................*....................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ........*..............................................................................................................................................................|........................................................................................................................*............................................................... + // mls v18.4s, v10.4s, v29.4s // ................*......................................................................................................................................................|................................................................................................................................*....................................................... + // mul v19.4s, v11.4s, v25.4s // .......................................................................................................................................................................|.......................................................................*................................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // .......................................................................................................................................................................|....................................................................*................................................................................................................... 
+ // mls v19.4s, v11.4s, v29.4s // .......................................................................................................................................................................|.............................................................................*.......................................................................................................... + // mul v20.4s, v12.4s, v25.4s // .......................................................................................................................................................................|...................................*.................................................................................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // .......................................................................................................................................................................|.......................................*................................................................................................................................................ + // mls v20.4s, v12.4s, v29.4s // .......................................................................................................................................................................|.................................................*...................................................................................................................................... + // mul v21.4s, v13.4s, v25.4s // .......................................................................................................................................................................|........................................*............................................................................................................................................... 
+ // sqrdmulh v13.4s, v13.4s, v26.4s // .......................................................................................................................................................................|.........................................*.............................................................................................................................................. + // mls v21.4s, v13.4s, v29.4s // .......................................................................................................................................................................|..............................................*......................................................................................................................................... + // mul v22.4s, v14.4s, v25.4s // .......................................................................................................................................................................|.....................................................*.................................................................................................................................. + // sqrdmulh v14.4s, v14.4s, v26.4s // .......................................................................................................................................................................|......................................................*................................................................................................................................. + // mls v22.4s, v14.4s, v29.4s // .......................................................................................................................................................................|...............................................................................................*........................................................................................ 
+ // mul v23.4s, v15.4s, v25.4s // .......................................................................................................................................................................|............................................................*........................................................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // .......................................................................................................................................................................|................................................................*....................................................................................................................... + // mls v23.4s, v15.4s, v29.4s // .......................................................................................................................................................................|......................................................................*................................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .......................................................................................................................................................................|.....................................................................................................*.................................................................................. + // cmge v28.4s, v16.4s, v30.4s // .......................................................................................................................................................................|....................................................................................................*................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|........................................................................................................*............................................................................... + // mls v16.4s, v28.4s, v29.4s // .......................................................................................................................................................................|............................................................................................................*........................................................................... + // cmge v27.4s, v31.4s, v17.4s // .......................................................................................................................................................................|.........................................................................................................*.............................................................................. + // cmge v28.4s, v17.4s, v30.4s // .......................................................................................................................................................................|...........................................................................................*............................................................................................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.............................................................................................................*.......................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // .....................................*.................................................................................................................................|.....................................................................................................................................................*.................................. + // cmge v27.4s, v31.4s, v18.4s // ..........................................................*............................................................................................................|..........................................................................................................................................................................*............. + // cmge v28.4s, v18.4s, v30.4s // ......................................................*................................................................................................................|......................................................................................................................................................................*................. + // sub v28.4s, v27.4s, v28.4s // .............................................................*.........................................................................................................|.............................................................................................................................................................................*.......... + // mls v18.4s, v28.4s, v29.4s // .................................................................*.....................................................................................................|.................................................................................................................................................................................*...... 
+ // cmge v27.4s, v31.4s, v19.4s // .......................................................................................................................................................................|..................................................................................*..................................................................................................... + // cmge v28.4s, v19.4s, v30.4s // .......................................................................................................................................................................|...................................................................................................*.................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|.......................................................................................................*................................................................................ + // mls v19.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................................................................................*............................................................................. + // cmge v27.4s, v31.4s, v20.4s // .......................................................................................................................................................................|...............................................................................................................*........................................................................ 
+ // cmge v28.4s, v20.4s, v30.4s // ..*....................................................................................................................................................................|..................................................................................................................*..................................................................... + // sub v28.4s, v27.4s, v28.4s // .......*...............................................................................................................................................................|.......................................................................................................................*................................................................ + // mls v20.4s, v28.4s, v29.4s // .............*.........................................................................................................................................................|.............................................................................................................................*.......................................................... + // cmge v27.4s, v31.4s, v21.4s // .......................................................................................................................................................................|..........................................................................*............................................................................................................. + // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................................................................................|............................................................................*........................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|................................................................................*....................................................................................................... + // mls v21.4s, v28.4s, v29.4s // .......................................................................................................................................................................|..........................................................................................*............................................................................................. + // cmge v27.4s, v31.4s, v22.4s // ....*..................................................................................................................................................................|....................................................................................................................*................................................................... + // cmge v28.4s, v22.4s, v30.4s // ......*................................................................................................................................................................|......................................................................................................................*................................................................. + // sub v28.4s, v27.4s, v28.4s // ......................*................................................................................................................................................|......................................................................................................................................*................................................. 
+ // mls v22.4s, v28.4s, v29.4s // ...............................*.......................................................................................................................................|...............................................................................................................................................*........................................ + // cmge v27.4s, v31.4s, v23.4s // .......................................................................................................................................................................|........................................................................................*............................................................................................... + // cmge v28.4s, v23.4s, v30.4s // .......................................................................................................................................................................|......................................................................................*................................................................................................. + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................................................................|............................................................................................*........................................................................................... + // mls v23.4s, v28.4s, v29.4s // ...........*...........................................................................................................................................................|...........................................................................................................................*............................................................ 
+ // str q16, [x1], #(16) // .*.....................................................................................................................................................................|.................................................................................................................*...................................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // .........................................................*.............................................................................................................|.........................................................................................................................................................................*.............. + // str q18, [x1, #(-16 + 2*(512/8))] // .......................................................................*...............................................................................................|.......................................................................................................................................................................................* + // str q19, [x1, #(-16 + 3*(512/8))] // .....*.................................................................................................................................................................|.....................................................................................................................*.................................................................. + // str q20, [x1, #(-16 + 4*(512/8))] // ...................*...................................................................................................................................................|...................................................................................................................................*.................................................... 
+ // str q21, [x1, #(-16 + 5*(512/8))] // .......................................................................................................................................................................|.................................................................................................*...................................................................................... + // str q22, [x1, #(-16 + 6*(512/8))] // .........................................*.............................................................................................................................|.........................................................................................................................................................*.............................. + // str q23, [x1, #(-16 + 7*(512/8))] // ................................*......................................................................................................................................|................................................................................................................................................*....................................... + + sub count, count, #1 + cbnz count, layer1234_start + mls v17.4S, v14.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v14.4S, v24.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ mul v24.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mls v18.4S, v23.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + sub v23.4S, v13.4S, v20.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v13.4S, v20.4S // ..................................................................................................................................................................*..................................................................................................................... + mls v15.4S, v14.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ sqrdmulh v14.4S, v21.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v24.4S, v22.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v20.4S, v21.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mul v21.4S, v23.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v22.4S, v23.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v23.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mul v28.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + mls v20.4S, v14.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + cmge v14.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mls v21.4S, v22.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v22.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v28.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ cmge v23.4S, v31.4S, v24.4S // ................................................................................................................................................................................................*....................................................................................... + sub v22.4S, v22.4S, v14.4S // ......................................................................................................................................................................................*................................................................................................. + sub v14.4S, v8.4S, v16.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v16.4S // .........................................................................................................................................*.............................................................................................................................................. + cmge v16.4S, v24.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v17.4S, v22.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
+ add v22.4S, v28.4S, v27.4S // .......................................................................................................................................................................*................................................................................................................ + sub v23.4S, v23.4S, v16.4S // ..................................................................................................................................................................................................*..................................................................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v16.4S, v14.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v28.4S, v28.4S, v27.4S // ......................................................................................................................................................................*................................................................................................................. + mls v24.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ sqrdmulh v23.4S, v22.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v22.4S, v22.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v17.4S, v14.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + cmge v14.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + sub v27.4S, v15.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ + str q24, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... 
+ cmge v24.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + add v15.4S, v15.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... + sqrdmulh v20.4S, v28.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mls v16.4S, v17.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + mls v22.4S, v23.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v14.4S, v14.4S, v24.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ mul v17.4S, v28.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v24.4S, v27.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v18.4S, v14.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + cmge v14.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. 
+ mls v17.4S, v20.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v20.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... + sub v28.4S, v28.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v23.4S, v27.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v27.4S, v19.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mul v19.4S, v19.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sub v14.4S, v20.4S, v14.4S // ......................................................................................................................................................................................................*................................................................................. + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + mls v16.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v28.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v18.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v13.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... 
+ mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mls v19.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mls v21.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sqrdmulh v16.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v28.4S, v18.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. 
+ cmge v24.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v20.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v14.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v21.4S, v31.4S, v28.4S // ....................................................................................................................................................................................................................................................................*................... 
+ cmge v27.4S, v28.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + sub v14.4S, v14.4S, v18.4S // ..............................................................................................................................................................................................*......................................................................................... + sub v18.4S, v20.4S, v24.4S // ..............................................................................................................................................................................................................*......................................................................... + sub v27.4S, v21.4S, v27.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v20.4S, v31.4S, v17.4S // ........................................................................................................................................................................................................*............................................................................... + mls v23.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ cmge v21.4S, v17.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v28.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v19.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sub v20.4S, v20.4S, v21.4S // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mls v17.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ 
+ str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + sqrdmulh v14.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v12.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + str q28, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. 
+ mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + str q17, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v21.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mul v15.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. 
+ mls v18.4S, v14.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mls v17.4S, v16.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v23.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v15.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ cmge v11.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v8.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mls v12.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + sub v9.4S, v11.4S, v8.4S // ..........................................................................................................................................................................................................................................................*............................. + cmge v19.4S, v15.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ cmge v28.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v21.4S, v31.4S, v15.4S // ............................................................................................................................................................................................................................................................*........................... + sub v14.4S, v21.4S, v19.4S // ..............................................................................................................................................................................................................................................................*......................... + cmge v11.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mls v18.4S, v9.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... 
+ mls v15.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v13.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sub v24.4S, v27.4S, v11.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v27.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q18, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... + cmge v8.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ mls v17.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sub v28.4S, v28.4S, v27.4S // ..........................................................................................................................................................................................................................................................................*............. + str q15, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... + cmge v21.4S, v12.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v24.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + sub v15.4S, v23.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ cmge v27.4S, v31.4S, v12.4S // ............................................................................................................................................................................................................................................................................*........... + mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v13.4S, v13.4S, v24.4S // ..................................................................................................................................................................................................................................................*..................................... + str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + mls v20.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v14.4S, v27.4S, v21.4S // ..............................................................................................................................................................................................................................................................................*......... 
+ mls v16.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + mls v12.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q22, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. + str q20, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... 
+ str q12, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a55.s b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a55.s new file mode 100644 index 00000000..92b51ecb --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a55.s @@ -0,0 +1,1718 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] // load the q-form alias of vec from base + offset +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc // load from base, then post-increment base by inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] // store the q-form alias of vec at base + offset +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc // store at base, then post-increment base by inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 // dst = src * lane idx0 of const (32-bit lanes) + vqrdmulhq \src, \src, \const, \idx1 // src is overwritten: sqrdmulh of src by lane idx1 of const + vmls \dst, \src, modulus // dst -= src * modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const // dst = src * const (32-bit lanes) + vqrdmulh \src, \src, \const_twisted // src is overwritten: sqrdmulh of src by const_twisted + vmls \dst, \src, modulus // dst -= src * modulus +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 // tmp = a >> 23 with rounding (signed); uses the tmp register alias + vmls \a, tmp, modulus // a -= tmp * modulus +.endm + +.macro canonical_reduce a, 
modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 lane = all-ones (-1) where neg_modulus_half >= a + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 lane = all-ones (-1) where a >= modulus_half + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 = tmp1 - tmp2: -1 where tmp1 was set, +1 where tmp2 was set + vmls \a, \tmp2, modulus // a -= tmp2 * modulus, i.e. add or subtract one modulus per flagged lane +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b // tmp = a - b (clobbers the tmp register alias) + vadd \a, \a, \b // a = a + b + mulmodq \b, tmp, \root, \idx0, \idx1 // b = mulmodq of tmp with lanes idx0/idx1 of root +.endm + +.macro mulmod_v dst, src, const, const_twisted + vmul \dst, \src, \const // same three-instruction sequence as mulmod + vqrdmulh \src, \src, \const_twisted // src is overwritten + vmls \dst, \src, modulus // dst -= src * modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b // tmp = a - b (clobbers the tmp register alias) + vadd \a, \a, \b // a = a + b + mulmod \b, tmp, \root, \root_twisted // b = mulmod of tmp with full-vector root / root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw // dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..7; mulmod overwrites each srcN + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) // post-increment: r_ptr advances by 8*16 bytes after this load + ldr_vo root1, \r_ptr, (-8*16 + 1*16) // remaining offsets are relative to the already advanced pointer + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 // load one vector and advance the pointer by 16 bytes +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 // loads a full q register but advances the pointer by only 8 bytes +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) // post-increment by 6*16 bytes; the loads below reach back into the same six-vector group + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, 
\data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + // Spill x19-x29 into the 96-byte frame reserved above; restore_gprs reloads the same slots. + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the frame reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // reserve 64 bytes for d8-d15 (only the low 64 bits of v8-v15 are saved) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the frame reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // scratch area addressed through the save/restore macros +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_opt_a55 + .global _intt_dilithium_1234_5678_opt_a55 + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_opt_a55: +_intt_dilithium_1234_5678_opt_a55: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 
.req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v7.4S, v8.4S, v9.4S, v10.4S}, [x0] // *.......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q17, [x3, #80] // ............*.............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v3.4S, v9.4S, v10.4S // ...*....................................... + // gap // ........................................... + add v6.4S, v9.4S, v10.4S // ....*...................................... + // gap // ........................................... + sub v28.4S, v7.4S, v8.4S // .*......................................... + // gap // ........................................... + sqrdmulh v17.4S, v3.4S, v17.4S // ..............*............................ + // gap // ........................................... + ldr q21, [x3, #64] // ...........*............................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q14, [x3, #48] // .......*................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q2, [x3, #32] // ......*.................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v24.4S, v3.4S, v21.4S // .............*............................. + // gap // ........................................... + sqrdmulh v14.4S, v28.4S, v14.4S // ..........*................................ + // gap // ........................................... 
+ mul v28.4S, v28.4S, v2.4S // .........*................................. + // gap // ........................................... + ldr q21, [x3, #16] // ..................*........................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v31.4S, v7.4S, v8.4S // ..*........................................ + // gap // ........................................... + mls v24.4S, v17.4S, v29.4S // ................*.......................... + // gap // ........................................... + mls v28.4S, v14.4S, v29.4S // ...............*........................... + // gap // ........................................... + sub v3.4S, v31.4S, v6.4S // .....*..................................... + // gap // ........................................... + ldr q14, [x3], #(6*16) // .................*......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v17.4S, v28.4S, v24.4S // ...................*....................... + // gap // ........................................... + sqrdmulh v2.4S, v3.4S, v21.4S // .....................*..................... + // gap // ........................................... + mul v3.4S, v3.4S, v14.4S // ....................*...................... + // gap // ........................................... + sqrdmulh v21.4S, v17.4S, v21.4S // .......................*................... + // gap // ........................................... + mul v14.4S, v17.4S, v14.4S // ......................*.................... + // gap // ........................................... + add v6.4S, v31.4S, v6.4S // ........*.................................. + // gap // ........................................... 
+ add v17.4S, v28.4S, v24.4S // .........................*................. + // gap // ........................................... + mls v3.4S, v2.4S, v29.4S // ........................*.................. + // gap // ........................................... + mls v14.4S, v21.4S, v29.4S // ..........................*................ + // gap // ........................................... + trn2 v2.4S, v6.4S, v17.4S // ............................*.............. + // gap // ........................................... + trn1 v24.4S, v6.4S, v17.4S // ...........................*............... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v21.4S, v3.4S, v14.4S // ..............................*............ + // gap // ........................................... + trn1 v14.4S, v3.4S, v14.4S // .............................*............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v17.2D, v2.2D, v21.2D // ..................................*........ + // gap // ........................................... + trn2 v3.2D, v2.2D, v21.2D // .................................*......... + // gap // ........................................... + trn1 v2.2D, v24.2D, v14.2D // ................................*.......... + // gap // ........................................... + trn2 v31.2D, v24.2D, v14.2D // ...............................*........... + // gap // ........................................... + add v15.4S, v2.4S, v17.4S // ....................................*...... + // gap // ........................................... + add v26.4S, v31.4S, v3.4S // ...................................*....... + // gap // ........................................... 
+ ldr q13, [x4], #8 // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v14.4S, v15.4S, v26.4S // .....................................*..... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + srshr v21.4S, v14.4S, #23 // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v14.4S, v21.4S, v29.4S // ........................................*.. + // gap // ........................................... + ldr q1, [x4], #16 // .........................................*. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + str q14, [x0], #(16*4) // ..........................................* + // gap // ........................................... + + // original source code + // ld4 {v6.4S, v7.4S, v8.4S, v9.4S}, [x0] // *.......................................... + // sub v14.4S, v6.4S, v7.4S // ....*...................................... + // add v2.4S, v6.4S, v7.4S // .............*............................. + // sub v3.4S, v8.4S, v9.4S // ..*........................................ + // add v24.4S, v8.4S, v9.4S // ...*....................................... 
+ // sub v17.4S, v2.4S, v24.4S // ................*.......................... + // ldr q21, [x3, #32] // ........*.................................. + // ldr q26, [x3, #48] // .......*................................... + // add v2.4S, v2.4S, v24.4S // .......................*................... + // mul v21.4S, v14.4S, v21.4S // ...........*............................... + // sqrdmulh v14.4S, v14.4S, v26.4S // ..........*................................ + // ldr q24, [x3, #64] // ......*.................................... + // ldr q26, [x3, #80] // .*......................................... + // mul v24.4S, v3.4S, v24.4S // .........*................................. + // sqrdmulh v3.4S, v3.4S, v26.4S // .....*..................................... + // mls v21.4S, v14.4S, v29.4S // ...............*........................... + // mls v24.4S, v3.4S, v29.4S // ..............*............................ + // ldr q14, [x3], #(6*16) // .................*......................... + // ldr q3, [x3, #-80] // ............*.............................. + // sub v6.4S, v21.4S, v24.4S // ..................*........................ + // mul v31.4S, v17.4S, v14.4S // ....................*...................... + // sqrdmulh v17.4S, v17.4S, v3.4S // ...................*....................... + // mul v14.4S, v6.4S, v14.4S // ......................*.................... + // sqrdmulh v3.4S, v6.4S, v3.4S // .....................*..................... + // mls v31.4S, v17.4S, v29.4S // .........................*................. + // add v17.4S, v21.4S, v24.4S // ........................*.................. + // mls v14.4S, v3.4S, v29.4S // ..........................*................ + // trn1 v3.4S, v2.4S, v17.4S // ............................*.............. + // trn2 v17.4S, v2.4S, v17.4S // ...........................*............... + // trn1 v2.4S, v31.4S, v14.4S // ..............................*............ 
+ // trn2 v14.4S, v31.4S, v14.4S // .............................*............. + // trn2 v31.2D, v3.2D, v2.2D // ..................................*........ + // trn1 v2.2D, v3.2D, v2.2D // .................................*......... + // trn2 v3.2D, v17.2D, v14.2D // ................................*.......... + // trn1 v17.2D, v17.2D, v14.2D // ...............................*........... + // add v26.4S, v31.4S, v3.4S // ....................................*...... + // add v15.4S, v2.4S, v17.4S // ...................................*....... + // add v14.4S, v15.4S, v26.4S // ......................................*.... + // srshr v21.4S, v14.4S, #23 // .......................................*... + // ldr q13, [x4], #8 // .....................................*..... + // mls v14.4S, v21.4S, v29.4S // ........................................*.. + // ldr q1, [x4], #16 // .........................................*. + // str q14, [x0], #(16*4) // ..........................................* + + sub count, count, #1 +layer5678_start: + ld4 {v6.4S, v7.4S, v8.4S, v9.4S}, [x0] // e................................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v17.4S, v2.4S, v17.4S // .....................................*........................... + // gap // ................................................................. + sub v21.4S, v31.4S, v3.4S // ..........................................*...................... + // gap // ................................................................. + sub v14.4S, v6.4S, v7.4S // .......e......................................................... + // gap // ................................................................. + add v2.4S, v6.4S, v7.4S // ........e........................................................ + // gap // ................................................................. + sub v3.4S, v8.4S, v9.4S // ............e.................................................... + // gap // ................................................................. + add v24.4S, v8.4S, v9.4S // .............e................................................... + // gap // ................................................................. + mul v28.4S, v17.4S, v1.S[0] // .......................................*......................... + // gap // ................................................................. + sqrdmulh v17.4S, v17.4S, v1.S[1] // ........................................*........................ + // gap // ................................................................. 
+ mul v6.4S, v21.4S, v1.S[2] // ............................................*.................... + // gap // ................................................................. + sqrdmulh v21.4S, v21.4S, v1.S[3] // .............................................*................... + // gap // ................................................................. + sub v31.4S, v15.4S, v26.4S // ...............................................*................. + // gap // ................................................................. + mls v28.4S, v17.4S, v29.4S // .........................................*....................... + // gap // ................................................................. + sub v17.4S, v2.4S, v24.4S // .................e............................................... + // gap // ................................................................. + mls v6.4S, v21.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + ldr q21, [x3, #32] // ...e............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q26, [x3, #48] // ....e............................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v2.4S, v24.4S // ..................e.............................................. + // gap // ................................................................. + mul v21.4S, v14.4S, v21.4S // .........e....................................................... 
+ // gap // ................................................................. + sqrdmulh v14.4S, v14.4S, v26.4S // ..........e...................................................... + // gap // ................................................................. + ldr q24, [x3, #64] // .....e........................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q26, [x3, #80] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v1.4S, v31.4S, v13.S[0] // .................................................*............... + // gap // ................................................................. + mul v24.4S, v3.4S, v24.4S // ..............e.................................................. + // gap // ................................................................. + sqrdmulh v3.4S, v3.4S, v26.4S // ...............e................................................. + // gap // ................................................................. + sqrdmulh v31.4S, v31.4S, v13.S[1] // ..................................................*.............. + // gap // ................................................................. + sub v26.4S, v28.4S, v6.4S // ....................................................*............ + // gap // ................................................................. + add v28.4S, v28.4S, v6.4S // .....................................................*........... + // gap // ................................................................. 
+ mls v21.4S, v14.4S, v29.4S // ...........e..................................................... + // gap // ................................................................. + mls v1.4S, v31.4S, v29.4S // ...................................................*............. + // gap // ................................................................. + mls v24.4S, v3.4S, v29.4S // ................e................................................ + // gap // ................................................................. + ldr q14, [x3], #(6*16) // .e............................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q3, [x3, #-80] // ..e.............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v6.4S, v21.4S, v24.4S // ......................e.......................................... + // gap // ................................................................. + mul v31.4S, v17.4S, v14.4S // ...................e............................................. + // gap // ................................................................. + sqrdmulh v17.4S, v17.4S, v3.4S // ....................e............................................ + // gap // ................................................................. + mul v14.4S, v6.4S, v14.4S // ........................e........................................ + // gap // ................................................................. + sqrdmulh v3.4S, v6.4S, v3.4S // .........................e....................................... 
+ // gap // ................................................................. + sqrdmulh v6.4S, v26.4S, v13.S[1] // .......................................................*......... + // gap // ................................................................. + mls v31.4S, v17.4S, v29.4S // .....................e........................................... + // gap // ................................................................. + add v17.4S, v21.4S, v24.4S // .......................e......................................... + // gap // ................................................................. + mls v14.4S, v3.4S, v29.4S // ..........................e...................................... + // gap // ................................................................. + mul v21.4S, v26.4S, v13.S[0] // ......................................................*.......... + // gap // ................................................................. + trn1 v3.4S, v2.4S, v17.4S // ...........................e..................................... + // gap // ................................................................. + trn2 v17.4S, v2.4S, v17.4S // ............................e.................................... + // gap // ................................................................. + trn1 v2.4S, v31.4S, v14.4S // .............................e................................... + // gap // ................................................................. + srshr v24.4S, v28.4S, #23 // ...........................................................*..... + // gap // ................................................................. + trn2 v14.4S, v31.4S, v14.4S // ..............................e.................................. + // gap // ................................................................. + trn2 v31.2D, v3.2D, v2.2D // ...............................e................................. 
+ // gap // ................................................................. + trn1 v2.2D, v3.2D, v2.2D // .................................e............................... + // gap // ................................................................. + trn2 v3.2D, v17.2D, v14.2D // ................................e................................ + // gap // ................................................................. + trn1 v17.2D, v17.2D, v14.2D // ..................................e.............................. + // gap // ................................................................. + add v26.4S, v31.4S, v3.4S // ...........................................e..................... + // gap // ................................................................. + add v15.4S, v2.4S, v17.4S // ......................................e.......................... + // gap // ................................................................. + mls v21.4S, v6.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + str q1, [x0, #-32] // ...............................................................*. + // gap // ................................................................. + add v14.4S, v15.4S, v26.4S // ................................................e................ + // gap // ................................................................. + mls v28.4S, v24.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + str q21, [x0, #-16] // ................................................................* + // gap // ................................................................. + srshr v21.4S, v14.4S, #23 // .........................................................e....... + // gap // ................................................................. 
+ ldr q13, [x4], #8 // ...................................e............................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v21.4S, v29.4S // ..........................................................e...... + // gap // ................................................................. + str q28, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + ldr q1, [x4], #16 // ....................................e............................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q14, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e................................................................e.............................................................. + // ldr q0, [x3], #(6*16) // ...............................e.................................|..............................e............................... + // ldr q4, [x3, #(-6*16 + 1*16)] // ................................e................................|...............................e.............................. + // ldr q1, [x3, #(-6*16 + 2*16)] // ...............e.................................................|..............e............................................... 
+ // ldr q5, [x3, #(-6*16 + 3*16)] // ................e................................................|...............e.............................................. + // ldr q2, [x3, #(-6*16 + 4*16)] // ....................e............................................|...................e.......................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // .....................e...........................................|....................e......................................... + // sub v24.4s, v8.4s, v9.4s // ...e.............................................................|..e........................................................... + // add v8.4s, v8.4s, v9.4s // ....e............................................................|...e.......................................................... + // mul v9.4s, v24.4s, v1.4s // ..................e..............................................|.................e............................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................e.............................................|..................e........................................... + // mls v9.4s, v24.4s, v29.4s // ............................e....................................|...........................e.................................. + // sub v24.4s, v10.4s, v11.4s // .....e...........................................................|....e......................................................... + // add v10.4s, v10.4s, v11.4s // ......e..........................................................|.....e........................................................ + // mul v11.4s, v24.4s, v2.4s // .......................e.........................................|......................e....................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ........................e........................................|.......................e...................................... 
+ // mls v11.4s, v24.4s, v29.4s // ..............................e..................................|.............................e................................ + // sub v24.4s, v8.4s, v10.4s // .............e...................................................|............e................................................. + // add v8.4s, v8.4s, v10.4s // .................e...............................................|................e............................................. + // mul v10.4s, v24.4s, v0.4s // ..................................e..............................|.................................e............................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................e.............................|..................................e........................... + // mls v10.4s, v24.4s, v29.4s // .......................................e.........................|......................................e....................... + // sub v24.4s, v9.4s, v11.4s // .................................e...............................|................................e............................. + // add v9.4s, v9.4s, v11.4s // ........................................e........................|.......................................e...................... + // mul v11.4s, v24.4s, v0.4s // ....................................e............................|...................................e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .....................................e...........................|....................................e......................... + // mls v11.4s, v24.4s, v29.4s // .........................................e.......................|........................................e..................... + // trn1 v25.4s, v8.4s, v9.4s // ...........................................e.....................|..........................................e................... 
+ // trn2 v26.4s, v8.4s, v9.4s // ............................................e....................|...........................................e.................. + // trn1 v27.4s, v10.4s, v11.4s // .............................................e...................|............................................e................. + // trn2 v28.4s, v10.4s, v11.4s // ...............................................e.................|..............................................e............... + // trn2 v10.2d, v25.2d, v27.2d // ................................................e................|...............................................e.............. + // trn2 v11.2d, v26.2d, v28.2d // ..................................................e..............|.................................................e............ + // trn1 v8.2d, v25.2d, v27.2d // .................................................e...............|................................................e............. + // trn1 v9.2d, v26.2d, v28.2d // ...................................................e.............|..................................................e........... + // ldr q1, [x4], #8 // ............................................................e....|...........................................................e.. + // ldr q0, [x4], #16 // ...............................................................e.|.............................................................. + // sub v24.4s, v8.4s, v9.4s // .*...............................................................|*............................................................. + // add v8.4s, v8.4s, v9.4s // .....................................................e...........|....................................................e......... + // mul v9.4s, v24.4s, v0.s[0] // .......*.........................................................|......*....................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........*........................................................|.......*...................................................... + // mls v9.4s, v24.4s, v29.4s // ............*....................................................|...........*.................................................. + // sub v24.4s, v10.4s, v11.4s // ..*..............................................................|.*............................................................ + // add v10.4s, v10.4s, v11.4s // ....................................................e............|...................................................e.......... + // mul v11.4s, v24.4s, v0.s[2] // .........*.......................................................|........*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........*......................................................|.........*.................................................... + // mls v11.4s, v24.4s, v29.4s // ..............*..................................................|.............*................................................ + // sub v24.4s, v8.4s, v10.4s // ...........*.....................................................|..........*................................................... + // add v8.4s, v8.4s, v10.4s // ........................................................e........|.......................................................e...... + // mul v10.4s, v24.4s, v1.s[0] // ......................*..........................................|.....................*........................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................*.......................................|........................*..................................... + // mls v10.4s, v24.4s, v29.4s // .............................*...................................|............................*................................. 
+ // sub v24.4s, v9.4s, v11.4s // ..........................*......................................|.........................*.................................... + // add v9.4s, v9.4s, v11.4s // ...........................*.....................................|..........................*................................... + // mul v11.4s, v24.4s, v1.s[0] // ..........................................*......................|.........................................*.................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................*..........................|.....................................*........................ + // mls v11.4s, v24.4s, v29.4s // ......................................................*..........|.....................................................*........ + // srshr v24.4S, v8.4S, #23 // ...........................................................e.....|..........................................................e... + // mls v8.4s, v24.4s, v29.4s // .............................................................e...|............................................................e. + // srshr v24.4S, v9.4S, #23 // ..............................................*..................|.............................................*................ + // mls v9.4s, v24.4s, v29.4s // .........................................................*.......|........................................................*..... + // str q8, [x0], #(16*4) // ................................................................e|.............................................................. + // str q9, [x0, #(-16*4 + 1*16)] // ..............................................................*..|.............................................................* + // str q10, [x0, #(-16*4 + 2*16)] // .......................................................*.........|......................................................*....... 
+ // str q11, [x0, #(-16*4 + 3*16)] // ..........................................................*......|.........................................................*.... + + sub count, count, #1 + cbnz count, layer5678_start + sub v16.4S, v31.4S, v3.4S // .*.................... + // gap // ...................... + sub v24.4S, v2.4S, v17.4S // *..................... + // gap // ...................... + sub v20.4S, v15.4S, v26.4S // ......*............... + // gap // ...................... + mul v17.4S, v16.4S, v1.S[2] // ....*................. + // gap // ...................... + sqrdmulh v28.4S, v16.4S, v1.S[3] // .....*................ + // gap // ...................... + mul v26.4S, v24.4S, v1.S[0] // ..*................... + // gap // ...................... + sqrdmulh v9.4S, v24.4S, v1.S[1] // ...*.................. + // gap // ...................... + sqrdmulh v2.4S, v20.4S, v13.S[1] // ..........*........... + // gap // ...................... + mls v17.4S, v28.4S, v29.4S // ........*............. + // gap // ...................... + mul v21.4S, v20.4S, v13.S[0] // .........*............ + // gap // ...................... + mls v26.4S, v9.4S, v29.4S // .......*.............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v21.4S, v2.4S, v29.4S // .............*........ + // gap // ...................... + add v3.4S, v26.4S, v17.4S // ............*......... + // gap // ...................... + sub v15.4S, v26.4S, v17.4S // ...........*.......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + srshr v12.4S, v3.4S, #23 // ................*..... + // gap // ...................... + sqrdmulh v28.4S, v15.4S, v13.S[1] // ..............*....... + // gap // ...................... + mul v17.4S, v15.4S, v13.S[0] // ...............*...... + // gap // ...................... 
+ mls v3.4S, v12.4S, v29.4S // ...................*.. + // gap // ...................... + str q21, [x0, #-32] // ..................*... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v17.4S, v28.4S, v29.4S // .................*.... + // gap // ...................... + str q3, [x0, #-48] // .....................* + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q17, [x0, #-16] // ....................*. + // gap // ...................... + + // original source code + // sub v17.4S, v2.4S, v17.4S // .*.................... + // sub v21.4S, v31.4S, v3.4S // *..................... + // mul v28.4S, v17.4S, v1.S[0] // .....*................ + // sqrdmulh v17.4S, v17.4S, v1.S[1] // ......*............... + // mul v6.4S, v21.4S, v1.S[2] // ...*.................. + // sqrdmulh v21.4S, v21.4S, v1.S[3] // ....*................. + // sub v31.4S, v15.4S, v26.4S // ..*................... + // mls v28.4S, v17.4S, v29.4S // ..........*........... + // mls v6.4S, v21.4S, v29.4S // ........*............. + // mul v1.4S, v31.4S, v13.S[0] // .........*............ + // sqrdmulh v31.4S, v31.4S, v13.S[1] // .......*.............. + // sub v26.4S, v28.4S, v6.4S // .............*........ + // add v28.4S, v28.4S, v6.4S // ............*......... + // mls v1.4S, v31.4S, v29.4S // ...........*.......... + // sqrdmulh v6.4S, v26.4S, v13.S[1] // ...............*...... + // mul v21.4S, v26.4S, v13.S[0] // ................*..... + // srshr v24.4S, v28.4S, #23 // ..............*....... + // mls v21.4S, v6.4S, v29.4S // ...................*.. + // str q1, [x0, #-32] // ..................*... + // mls v28.4S, v24.4S, v29.4S // .................*.... + // str q21, [x0, #-16] // .....................* + // str q28, [x0, #-48] // ....................*. 
+
+        // Undefine the register aliases listed below (GNU as .unreq); their .req definitions are not part of this section.
+        .unreq root0_tw
+        .unreq root1_tw
+        .unreq root2_tw
+        .unreq root3_tw
+        .unreq qform_root0_tw
+        .unreq qform_root1_tw
+        .unreq qform_root2_tw
+        .unreq qform_root3_tw
+        .unreq t0
+        .unreq t1
+        // Bind new aliases: root4-root7 name v4-v7 and qform_root4-qform_root7 name the same registers through their q view; ninv, ninv_tw, modulus_half and neg_modulus_half name v25, v26, v30 and v31.
+        root4            .req v4
+        root5            .req v5
+        root6            .req v6
+        root7            .req v7
+        qform_root4      .req q4
+        qform_root5      .req q5
+        qform_root6      .req q6
+        qform_root7      .req q7
+        ninv             .req v25
+        ninv_tw          .req v26
+        modulus_half     .req v30
+        neg_modulus_half .req v31
+
+        // 'restore' is a macro defined elsewhere - presumably it reloads 'in' from the slot STACK0 (TODO confirm against the macro). count is then set to 4.
+        restore in, STACK0
+        mov count, #4
+        // Broadcast two 32-bit constants: ASM_LOAD presumably places the address of ninv_addr / ninv_tw_addr in xtmp (macro defined elsewhere - confirm); ld1r then replicates the word at [xtmp] into all four lanes of the destination.
+        ASM_LOAD(xtmp, ninv_addr)
+        ld1r {ninv.4s}, [xtmp]
+        ASM_LOAD(xtmp, ninv_tw_addr)
+        ld1r {ninv_tw.4s}, [xtmp]
+        // modulus_half = modulus >> 1 (logical shift) in every 32-bit lane; neg_modulus_half = its lane-wise negation.
+        ushr modulus_half.4S, modulus.4S, #1
+        neg neg_modulus_half.4S, modulus_half.4S
+        // load_roots_1234 is a macro defined elsewhere; presumably it fills v0-v7 from r_ptr1, since the instructions that follow multiply by lanes of v0-v7 (TODO confirm).
+        load_roots_1234 r_ptr1
+        // Align the next instruction to a 4-byte (2^2) boundary.
+        .p2align 2
+        ldr q15, [x1, #192]                     // ...*....................................................................................................................................................................................................................................................................................
+        ldr q9, [x1, #128]                      // ..*.....................................................................................................................................................................................................................................................................................
+        ldr q19, [x1, #448]                     // .......*................................................................................................................................................................................................................................................................................
+        add v22.4S, v9.4S, v15.4S               // ......................*.................................................................................................................................................................................................................................................................
+ ldr q20, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q11, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + sub v17.4S, v20.4S, v19.4S // ...............................*........................................................................................................................................................................................................................................................ + ldr q16, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + sqrdmulh v28.4S, v17.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + mul v24.4S, v17.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... 
+ sub v8.4S, v11.4S, v16.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q17, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + mul v13.4S, v8.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v8.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + ldr q21, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + mls v24.4S, v28.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... 
+ mls v13.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + add v28.4S, v21.4S, v17.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v14.4S, v21.4S, v17.4S // ..............................................*......................................................................................................................................................................................................................................... + add v23.4S, v20.4S, v19.4S // ................................*....................................................................................................................................................................................................................................................... + sub v21.4S, v13.4S, v24.4S // .......................................................................*................................................................................................................................................................................................................ + add v20.4S, v11.4S, v16.4S // ...........................*............................................................................................................................................................................................................................................................ 
+ add v13.4S, v13.4S, v24.4S // ........................................................................*............................................................................................................................................................................................................... + sqrdmulh v17.4S, v21.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mul v11.4S, v21.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + sub v21.4S, v20.4S, v23.4S // ..................................................................*..................................................................................................................................................................................................................... + ldr q16, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + mls v11.4S, v17.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ 
+ ldr q18, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q17, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q12, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + add v24.4S, v18.4S, v17.4S // .....................................*.................................................................................................................................................................................................................................................. + sub v17.4S, v18.4S, v17.4S // ....................................*................................................................................................................................................................................................................................................... + add v8.4S, v12.4S, v16.4S // ..........................................*............................................................................................................................................................................................................................................. 
+ sub v27.4S, v12.4S, v16.4S // .........................................*.............................................................................................................................................................................................................................................. + sqrdmulh v12.4S, v14.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v10.4S, v24.4S, v8.4S // .............................................................................*.......................................................................................................................................................................................................... + sub v16.4S, v24.4S, v8.4S // ............................................................................*........................................................................................................................................................................................................... + sub v8.4S, v9.4S, v15.4S // .....................*.................................................................................................................................................................................................................................................................. + mul v19.4S, v14.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v16.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mul v14.4S, v16.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + ldr q15, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + mls v19.4S, v12.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mul v24.4S, v17.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + ldr q9, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
+ sqrdmulh v16.4S, v17.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + add v23.4S, v20.4S, v23.4S // ...................................................................*.................................................................................................................................................................................................................... + sub v12.4S, v15.4S, v9.4S // ...................................................*.................................................................................................................................................................................................................................... + add v15.4S, v15.4S, v9.4S // ....................................................*................................................................................................................................................................................................................................... + mls v24.4S, v16.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + mul v20.4S, v12.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. 
+ add v9.4S, v28.4S, v15.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v12.4S, v12.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v16.4S, v21.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + add v17.4S, v10.4S, v9.4S // .....................................................................................................................*.................................................................................................................................................................. + mls v20.4S, v12.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + mls v14.4S, v18.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... 
+ mul v12.4S, v21.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sub v21.4S, v28.4S, v15.4S // ......................................................................................*................................................................................................................................................................................................. + ldr q28, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + mls v12.4S, v16.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + ldr q16, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + sqrdmulh v15.4S, v27.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... 
+ mul v27.4S, v27.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sub v18.4S, v16.4S, v28.4S // ................*....................................................................................................................................................................................................................................................................... + sub v10.4S, v10.4S, v9.4S // ....................................................................................................................*................................................................................................................................................................... + add v9.4S, v16.4S, v28.4S // .................*...................................................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v18.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v16.4S, v18.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... 
+ sqrdmulh v18.4S, v8.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + mul v8.4S, v8.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + mls v27.4S, v15.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mls v16.4S, v28.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sub v28.4S, v9.4S, v22.4S // ........................................................*............................................................................................................................................................................................................................... + mls v8.4S, v18.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. 
+ add v9.4S, v9.4S, v22.4S // .........................................................*.............................................................................................................................................................................................................................. + sqrdmulh v15.4S, v28.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + sub v18.4S, v24.4S, v27.4S // .................................................................................*...................................................................................................................................................................................................... + add v22.4S, v16.4S, v8.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v16.4S, v16.4S, v8.4S // .............................................................*.......................................................................................................................................................................................................................... + add v8.4S, v24.4S, v27.4S // ..................................................................................*..................................................................................................................................................................................................... 
+ add v27.4S, v22.4S, v13.4S // ......................................................................................................*................................................................................................................................................................................. + sqrdmulh v24.4S, v16.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + sub v13.4S, v22.4S, v13.4S // .....................................................................................................*.................................................................................................................................................................................. + mul v16.4S, v16.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mul v22.4S, v28.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + sqrdmulh v28.4S, v13.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... 
+ mul v13.4S, v13.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + mls v16.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mls v22.4S, v15.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + mls v13.4S, v28.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sub v15.4S, v16.4S, v11.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ add v28.4S, v16.4S, v11.4S // ................................................................................................................*....................................................................................................................................................................... + sub v11.4S, v19.4S, v20.4S // ...........................................................................................*............................................................................................................................................................................................ + sqrdmulh v16.4S, v15.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + mul v15.4S, v15.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + add v19.4S, v19.4S, v20.4S // ............................................................................................*........................................................................................................................................................................................... + mul v20.4S, v10.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. 
+ sqrdmulh v10.4S, v11.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v11.4S, v11.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + mls v15.4S, v16.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mls v20.4S, v24.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v24.4S, v8.4S, v19.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v19.4S, v8.4S, v19.4S // .........................................................................................................................*.............................................................................................................................................................. 
+ sqrdmulh v8.4S, v21.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mul v21.4S, v21.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + mls v11.4S, v10.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v16.4S, v19.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + add v10.4S, v22.4S, v12.4S // ...........................................................................................................*............................................................................................................................................................................ + mls v21.4S, v8.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. 
+ add v8.4S, v9.4S, v23.4S // .................................................................................................*...................................................................................................................................................................................... + mul v19.4S, v19.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sub v9.4S, v9.4S, v23.4S // ................................................................................................*....................................................................................................................................................................................... + sub v23.4S, v8.4S, v17.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v17.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v19.4S, v16.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... 
+ sqrdmulh v16.4S, v23.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v17.4S, v23.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v23.4S, v9.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mul v9.4S, v9.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sub v22.4S, v22.4S, v12.4S // ..........................................................................................................*............................................................................................................................................................................. + mls v17.4S, v16.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... 
+ sub v12.4S, v27.4S, v24.4S // .............................................................................................................................................*.......................................................................................................................................... + mls v9.4S, v23.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + add v27.4S, v27.4S, v24.4S // ..............................................................................................................................................*......................................................................................................................................... + cmge v23.4S, v17.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v16.4S, v31.4S, v17.4S // ................................................................................................................................................................................*....................................................................................................... + sub v24.4S, v9.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... 
+ sub v16.4S, v16.4S, v23.4S // ..................................................................................................................................................................................*..................................................................................................... + add v23.4S, v9.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + sqrdmulh v20.4S, v12.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + mls v17.4S, v16.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v9.4S, v12.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v16.4S, v22.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mul v12.4S, v22.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + str q17, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v17.4S, v24.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v22.4S, v24.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + sub count, count, #1 +layer1234_start: + mul v24.4S, v18.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v18.4S, v18.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ mls v12.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sqrdmulh v16.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mul v8.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + mls v24.4S, v18.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v18.4S, v14.4S, v21.4S // ...............................................................................................................................*........................................................................................................................................................ + mls v9.4S, v20.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ mls v8.4S, v16.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v16.4S, v24.4S, v11.4S // ...................................................................................................................................*.................................................................................................................................................... + add v11.4S, v24.4S, v11.4S // ....................................................................................................................................*................................................................................................................................................... + sub v20.4S, v14.4S, v21.4S // ..............................................................................................................................*......................................................................................................................................................... + mul v24.4S, v16.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v16.4S, v16.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ sub v21.4S, v28.4S, v11.4S // .......................................................................................................................................................*................................................................................................................................ + add v28.4S, v28.4S, v11.4S // ........................................................................................................................................................*............................................................................................................................... + cmge v14.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v24.4S, v16.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + cmge v16.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................*....................................... + mul v11.4S, v20.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... 
+ sub v14.4S, v16.4S, v14.4S // ..................................................................................................................................................................................................................................................*..................................... + add v16.4S, v15.4S, v24.4S // ............................................................................................................................................................................*........................................................................................................... + sub v15.4S, v15.4S, v24.4S // ...........................................................................................................................................................................*............................................................................................................ + sqrdmulh v20.4S, v20.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mul v24.4S, v21.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + mls v8.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ sub v14.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v11.4S, v20.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + cmge v20.4S, v9.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + str q8, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v8.4S, v31.4S, v9.4S // ....................................................................................................................................................................................*................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. 
+ sub v8.4S, v8.4S, v20.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v20.4S, v23.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + mls v9.4S, v8.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mls v24.4S, v21.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sqrdmulh v8.4S, v16.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... 
+ mul v16.4S, v16.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v18.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v21.4S, v27.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v27.4S, v27.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v16.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + mul v10.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ mls v17.4S, v22.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mls v21.4S, v27.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + mul v8.4S, v23.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v10.4S, v18.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + cmge v18.4S, v16.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v27.4S, v31.4S, v16.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v23.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................*................................... + mls v8.4S, v20.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + cmge v20.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v27.4S, v27.4S, v18.4S // ..............................................................................................................................................................................................................................................................................*......... + sub v20.4S, v23.4S, v20.4S // ......................................................................................................................................................................................................................................................*................................. + ldr q18, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. 
+ ldr q22, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + cmge v23.4S, v31.4S, v24.4S // ............................................................................................................................................................................................*........................................................................................... + str q9, [x1, #560] // .................................................................................................................................................................................................................*...................................................................... + add v9.4S, v22.4S, v18.4S // ...........................e............................................................................................................................................................................................................................................................ + sub v18.4S, v22.4S, v18.4S // ..........................e............................................................................................................................................................................................................................................................. + cmge v22.4S, v24.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ mls v16.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + sub v23.4S, v23.4S, v22.4S // ..............................................................................................................................................................................................*......................................................................................... + mul v27.4S, v18.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + mls v21.4S, v20.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + add v22.4S, v12.4S, v11.4S // .......................................................................................................................................................................*................................................................................................................ + add v20.4S, v13.4S, v19.4S // ..................................................................................................................................................................*..................................................................................................................... 
+ sub v12.4S, v12.4S, v11.4S // ......................................................................................................................................................................*................................................................................................................. + str q21, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + mul v21.4S, v20.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + mul v11.4S, v12.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + str q16, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + sqrdmulh v20.4S, v20.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... 
+ mul v16.4S, v14.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + sub v13.4S, v13.4S, v19.4S // .................................................................................................................................................................*...................................................................................................................... + mls v21.4S, v20.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sqrdmulh v20.4S, v14.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v14.4S, v15.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... 
+ sqrdmulh v19.4S, v28.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sqrdmulh v18.4S, v18.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mls v16.4S, v20.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + cmge v20.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + mls v24.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + ldr q23, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. 
+ mls v11.4S, v12.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + str q24, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... + mul v24.4S, v28.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + cmge v28.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + sqrdmulh v12.4S, v13.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sub v20.4S, v20.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ mls v24.4S, v19.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mul v19.4S, v13.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mls v10.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sqrdmulh v20.4S, v22.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + cmge v28.4S, v31.4S, v24.4S // ............................................................................................................................................................................................................................................................*........................... + cmge v13.4S, v24.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... 
+ mls v19.4S, v12.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + sub v12.4S, v28.4S, v13.4S // ..............................................................................................................................................................................................................................................................*......................... + sqrdmulh v15.4S, v15.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v13.4S, v16.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v24.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v28.4S, v31.4S, v19.4S // ....................................................................................................................................................................................................*................................................................................... 
+ mls v14.4S, v15.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v15.4S, v19.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + str q24, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + sub v15.4S, v28.4S, v15.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v24.4S, v17.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + cmge v28.4S, v31.4S, v17.4S // ................................................................................................................................................................................................*....................................................................................... 
+ mls v19.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v12.4S, v28.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v15.4S, v31.4S, v14.4S // ............................................................................................................................................................................................................*........................................................................... + ldr q28, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + ldr q24, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... 
+ mls v17.4S, v12.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + add v10.4S, v24.4S, v28.4S // ................................e....................................................................................................................................................................................................................................................... + mls v27.4S, v18.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + sub v24.4S, v24.4S, v28.4S // ...............................e........................................................................................................................................................................................................................................................ + str q17, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + add v17.4S, v9.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... 
+ mul v18.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sub v9.4S, v9.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... + ldr q10, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + mul v12.4S, v22.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + str q19, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. + ldr q19, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... 
+ sqrdmulh v28.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + cmge v22.4S, v31.4S, v16.4S // ........................................................................................................................................................................................*............................................................................................... + add v24.4S, v19.4S, v10.4S // ......................e................................................................................................................................................................................................................................................................. + sub v13.4S, v22.4S, v13.4S // ..........................................................................................................................................................................................*............................................................................................. + mls v18.4S, v28.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mls v12.4S, v20.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
+ mls v16.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + cmge v20.4S, v31.4S, v8.4S // ................................................................................................................................................................................................................................................................*....................... + sub v22.4S, v27.4S, v18.4S // .......................................................................e................................................................................................................................................................................................................ + add v13.4S, v27.4S, v18.4S // ........................................................................e............................................................................................................................................................................................................... + sub v27.4S, v19.4S, v10.4S // .....................e.................................................................................................................................................................................................................................................................. + cmge v10.4S, v14.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ ldr q28, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + ldr q19, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + sub v18.4S, v15.4S, v10.4S // ..............................................................................................................................................................................................................*......................................................................... + sqrdmulh v15.4S, v27.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + sub v10.4S, v28.4S, v19.4S // ................e....................................................................................................................................................................................................................................................................... + add v28.4S, v28.4S, v19.4S // .................e...................................................................................................................................................................................................................................................................... 
+ mul v19.4S, v27.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + mul v27.4S, v10.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + mls v14.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mls v19.4S, v15.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + str q16, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... 
+ mls v27.4S, v10.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v16.4S, v28.4S, v24.4S // ........................................................e............................................................................................................................................................................................................................... + add v24.4S, v28.4S, v24.4S // .........................................................e.............................................................................................................................................................................................................................. + cmge v18.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + add v28.4S, v27.4S, v19.4S // ..............................................................e......................................................................................................................................................................................................................... + sub v10.4S, v27.4S, v19.4S // .............................................................e.......................................................................................................................................................................................................................... 
+ cmge v19.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + add v27.4S, v28.4S, v13.4S // ......................................................................................................e................................................................................................................................................................................. + sub v28.4S, v28.4S, v13.4S // .....................................................................................................e.................................................................................................................................................................................. + sub v13.4S, v18.4S, v19.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v15.4S, v31.4S, v11.4S // ........................................................................................................................................................................................................*............................................................................... + str q14, [x1, #944] // .......................................................................................................................................................................................................................*................................................................ 
+ ldr q14, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + cmge v18.4S, v11.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + ldr q19, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + sub v18.4S, v15.4S, v18.4S // ..........................................................................................................................................................................................................*............................................................................. + mul v15.4S, v9.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + sqrdmulh v9.4S, v9.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. 
+ mls v11.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sub v18.4S, v19.4S, v23.4S // ....................................e................................................................................................................................................................................................................................................... + add v23.4S, v19.4S, v23.4S // .....................................e.................................................................................................................................................................................................................................................. + mls v15.4S, v9.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + cmge v19.4S, v8.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + str q11, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. 
+ mul v11.4S, v22.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v22.4S, v22.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + sub v9.4S, v20.4S, v19.4S // ..................................................................................................................................................................................................................................................................*..................... + mul v19.4S, v10.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v20.4S, v10.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mls v8.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... 
+ sqrdmulh v9.4S, v18.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + sub v10.4S, v24.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... + mls v11.4S, v22.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + str q8, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + add v8.4S, v24.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... + mul v24.4S, v18.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. 
+ cmge v18.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v17.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mls v19.4S, v20.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + sub v20.4S, v18.4S, v17.4S // ..........................................................................................................................................................................................................................................................................*............. + ldr q18, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + ldr q17, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. 
+ mul v22.4S, v16.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + mls v12.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + add v20.4S, v17.4S, v18.4S // ..........................................e............................................................................................................................................................................................................................................. + sub v17.4S, v17.4S, v18.4S // .........................................e.............................................................................................................................................................................................................................................. + str q12, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + sqrdmulh v12.4S, v16.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ 
+ sqrdmulh v18.4S, v17.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mul v17.4S, v17.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + mls v24.4S, v9.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + add v9.4S, v23.4S, v20.4S // .............................................................................e.......................................................................................................................................................................................................... + sub v16.4S, v23.4S, v20.4S // ............................................................................e........................................................................................................................................................................................................... + mls v17.4S, v18.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... 
+ ldr q20, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + ldr q23, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + sub v18.4S, v24.4S, v17.4S // .................................................................................e...................................................................................................................................................................................................... + add v17.4S, v24.4S, v17.4S // ..................................................................................e..................................................................................................................................................................................................... + ldr q24, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + mls v22.4S, v12.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... 
+ mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v12.4S, v24.4S, v23.4S // ...................................................e.................................................................................................................................................................................................................................... + add v13.4S, v24.4S, v23.4S // ....................................................e................................................................................................................................................................................................................................... + add v23.4S, v20.4S, v14.4S // ...............................................e........................................................................................................................................................................................................................................ + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + sqrdmulh v24.4S, v12.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. 
+ mul v12.4S, v12.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sub v21.4S, v23.4S, v13.4S // ......................................................................................e................................................................................................................................................................................................. + add v23.4S, v23.4S, v13.4S // .......................................................................................e................................................................................................................................................................................................ + sub v20.4S, v20.4S, v14.4S // ..............................................e......................................................................................................................................................................................................................................... + mul v13.4S, v28.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + mls v12.4S, v24.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ 
+ mul v14.4S, v16.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sqrdmulh v24.4S, v16.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + sqrdmulh v16.4S, v28.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + sub v28.4S, v9.4S, v23.4S // ....................................................................................................................e................................................................................................................................................................... + add v9.4S, v9.4S, v23.4S // .....................................................................................................................e.................................................................................................................................................................. + mls v14.4S, v24.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
+ sqrdmulh v24.4S, v28.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + sqrdmulh v23.4S, v10.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mls v13.4S, v16.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + mul v28.4S, v28.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + mul v16.4S, v10.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + add v10.4S, v22.4S, v15.4S // ...........................................................................................................e............................................................................................................................................................................ 
+ sub v22.4S, v22.4S, v15.4S // ..........................................................................................................e............................................................................................................................................................................. + sub v15.4S, v8.4S, v9.4S // ........................................................................................................................................e............................................................................................................................................... + mls v28.4S, v24.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + mls v16.4S, v23.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + mul v24.4S, v15.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. + sqrdmulh v23.4S, v15.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ 
+ sub v15.4S, v19.4S, v11.4S // ...............................................................................................................e........................................................................................................................................................................ + add v8.4S, v8.4S, v9.4S // .........................................................................................................................................e.............................................................................................................................................. + sub v9.4S, v16.4S, v28.4S // ............................................................................................................................................................e........................................................................................................................... + mls v24.4S, v23.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... + add v23.4S, v16.4S, v28.4S // .............................................................................................................................................................e.......................................................................................................................... + add v28.4S, v19.4S, v11.4S // ................................................................................................................e....................................................................................................................................................................... 
+ sqrdmulh v16.4S, v15.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + cmge v19.4S, v31.4S, v24.4S // ................................................................................................................................................................................e....................................................................................................... + cmge v11.4S, v24.4S, v30.4S // .................................................................................................................................................................................e...................................................................................................... + mul v15.4S, v15.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + sub v19.4S, v19.4S, v11.4S // ..................................................................................................................................................................................e..................................................................................................... + sqrdmulh v11.4S, v20.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... 
+ mul v20.4S, v20.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + mls v15.4S, v16.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... + sqrdmulh v16.4S, v22.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + mls v24.4S, v19.4S, v29.4S // ...................................................................................................................................................................................e.................................................................................................... + mls v20.4S, v11.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v21.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. 
+ mul v21.4S, v21.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + str q24, [x1, #512] // ................................................................................................................................................................................................................e....................................................................... + add v19.4S, v20.4S, v12.4S // ............................................................................................e........................................................................................................................................................................................... + sub v20.4S, v20.4S, v12.4S // ...........................................................................................e............................................................................................................................................................................................ + mul v12.4S, v22.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... + mls v21.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. 
+ sub v24.4S, v17.4S, v19.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v11.4S, v20.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + add v17.4S, v17.4S, v19.4S // ..........................................................................................................................e............................................................................................................................................................. + mul v19.4S, v24.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + sqrdmulh v22.4S, v24.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + sub v24.4S, v27.4S, v17.4S // .............................................................................................................................................e.......................................................................................................................................... 
+ add v27.4S, v27.4S, v17.4S // ..............................................................................................................................................e......................................................................................................................................... + mul v17.4S, v9.4S, v0.S[0] // ..............................................................................................................................................................e......................................................................................................................... + sqrdmulh v20.4S, v20.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mls v19.4S, v22.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... + sqrdmulh v22.4S, v9.4S, v0.S[1] // ...............................................................................................................................................................e........................................................................................................................ + mul v9.4S, v24.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ 
+ mls v11.4S, v20.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sqrdmulh v20.4S, v24.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // .....................................................................................e...........................................................................................................................................|...........................................................................................................................................e............................................................................ + // ldr q9, [x1, #(1*(512/8))] // ......................................................................................e..........................................................................................................................................|............................................................................................................................................e........................................................................... 
+ // ldr q10, [x1, #(2*(512/8))] // ........................................................................e........................................................................................................................................................|..............................................................................................................................e......................................................................................... + // ldr q11, [x1, #(3*(512/8))] // .....................................................................e...........................................................................................................................................................|...........................................................................................................................e............................................................................................ + // ldr q12, [x1, #(4*(512/8))] // .e...............................................................................................................................................................................................................................|.......................................................e................................................................................................................................................................ + // ldr q13, [x1, #(5*(512/8))] // e................................................................................................................................................................................................................................|......................................................e................................................................................................................................................................. 
+ // ldr q14, [x1, #(6*(512/8))] // ...........................................................e.....................................................................................................................................................................|.................................................................................................................e...................................................................................................... + // ldr q15, [x1, #(7*(512/8))] // ..........................................................e......................................................................................................................................................................|................................................................................................................e....................................................................................................... + // ldr q16, [x1, #(8*(512/8))] // ...............................................................................................................e.................................................................................................................|.....................................................................................................................................................................e.................................................. + // ldr q17, [x1, #(9*(512/8))] // ..............................e..................................................................................................................................................................................................|....................................................................................e................................................................................................................................... 
+ // ldr q18, [x1, #(10*(512/8))] // ..........................................................................................................................................e......................................................................................|................................................................................................................................................................................................e....................... + // ldr q19, [x1, #(11*(512/8))] // .........................................................................................................................................e.......................................................................................|...............................................................................................................................................................................................e........................ + // ldr q20, [x1, #(12*(512/8))] // .......................................................................................................................................................e.........................................................................|.............................................................................................................................................................................................................e.......... + // ldr q21, [x1, #(13*(512/8))] // .............................................................................................................e...................................................................................................................|...................................................................................................................................................................e.................................................... 
+ // ldr q22, [x1, #(14*(512/8))] // ...........................................................................................................................................................e.....................................................................|.................................................................................................................................................................................................................e...... + // ldr q23, [x1, #(15*(512/8))] // ........................................................................................................................................................e........................................................................|..............................................................................................................................................................................................................e......... + // sub v24.4s, v8.4s, v9.4s // .........................................................................................e.......................................................................................................................................|...............................................................................................................................................e........................................................................ + // add v8.4s, v8.4s, v9.4s // ..........................................................................................e......................................................................................................................................|................................................................................................................................................e....................................................................... 
+ // mul v9.4s, v24.4s, v3.s[2] // ............................................................................................e....................................................................................................................................|..................................................................................................................................................e..................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[3] // .............................................................................................e...................................................................................................................................|...................................................................................................................................................e.................................................................... + // mls v9.4s, v24.4s, v29.4s // .................................................................................................e...............................................................................................................................|.......................................................................................................................................................e................................................................ + // sub v24.4s, v10.4s, v11.4s // ...................................................................................e.............................................................................................................................................|.........................................................................................................................................e.............................................................................. 
+ // add v10.4s, v10.4s, v11.4s // ...........................................................................e.....................................................................................................................................................|.................................................................................................................................e...................................................................................... + // mul v11.4s, v24.4s, v4.s[0] // ...........................................................................................e.....................................................................................................................................|.................................................................................................................................................e...................................................................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ........................................................................................e........................................................................................................................................|..............................................................................................................................................e......................................................................... + // mls v11.4s, v24.4s, v29.4s // ...............................................................................................e.................................................................................................................................|.....................................................................................................................................................e.................................................................. 
+ // sub v24.4s, v12.4s, v13.4s // .....e...........................................................................................................................................................................................................................|...........................................................e............................................................................................................................................................ + // add v12.4s, v12.4s, v13.4s // ....e............................................................................................................................................................................................................................|..........................................................e............................................................................................................................................................. + // mul v13.4s, v24.4s, v4.s[2] // .........e.......................................................................................................................................................................................................................|...............................................................e........................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.s[3] // ..........................e......................................................................................................................................................................................................|................................................................................e....................................................................................................................................... 
+ // mls v13.4s, v24.4s, v29.4s // ...............................................................e.................................................................................................................................................................|.....................................................................................................................e.................................................................................................. + // sub v24.4s, v14.4s, v15.4s // ................................................................e................................................................................................................................................................|......................................................................................................................e................................................................................................. + // add v14.4s, v14.4s, v15.4s // ..............................................................e..................................................................................................................................................................|....................................................................................................................e................................................................................................... + // mul v15.4s, v24.4s, v5.s[0] // ...................................................................e.............................................................................................................................................................|.........................................................................................................................e.............................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v5.s[1] // .........................................................................e.......................................................................................................................................................|...............................................................................................................................e........................................................................................ + // mls v15.4s, v24.4s, v29.4s // .............................................................................e...................................................................................................................................................|...................................................................................................................................e.................................................................................... + // sub v24.4s, v16.4s, v17.4s // ....................................................................................................................e............................................................................................................|..........................................................................................................................................................................e............................................. + // add v16.4s, v16.4s, v17.4s // .....................................................................................................................e...........................................................................................................|...........................................................................................................................................................................e............................................ 
+ // mul v17.4s, v24.4s, v5.s[2] // ....................................................................................................................................e............................................................................................|..........................................................................................................................................................................................e............................. + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...............................................................................................................................e.................................................................................................|.....................................................................................................................................................................................e.................................. + // mls v17.4s, v24.4s, v29.4s // ...................................................................................................................................................e.............................................................................|.........................................................................................................................................................................................................e.............. + // sub v24.4s, v18.4s, v19.4s // ..............................................................................................................................................e..................................................................................|....................................................................................................................................................................................................e................... 
+ // add v18.4s, v18.4s, v19.4s // .............................................................................................................................................e...................................................................................|...................................................................................................................................................................................................e.................... + // mul v19.4s, v24.4s, v6.s[0] // ..................................................................................................................................................e..............................................................................|........................................................................................................................................................................................................e............... + // sqrdmulh v24.4s, v24.4s, v6.s[1] // .................................................................................................................................................e...............................................................................|.......................................................................................................................................................................................................e................ + // mls v19.4s, v24.4s, v29.4s // ......................................................................................................................................................e..........................................................................|............................................................................................................................................................................................................e........... 
+ // sub v24.4s, v20.4s, v21.4s // ......................................................................................................................................................................e..........................................................|........................................................................................................................................................................................................................ + // add v20.4s, v20.4s, v21.4s // ................................................................................................................................................................e................................................................|......................................................................................................................................................................................................................e. + // mul v21.4s, v24.4s, v6.s[2] // .......................................................................................................................................................................................................e.........................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ......................................................................................................................................................................................................e..........................|........................................................................................................................................................................................................................ 
+ // mls v21.4s, v24.4s, v29.4s // ...........................................................................................................................................................................................................e.....................|........................................................................................................................................................................................................................ + // sub v24.4s, v22.4s, v23.4s // ..............................................................................................................................................................e..................................................................|....................................................................................................................................................................................................................e... + // add v22.4s, v22.4s, v23.4s // ...............................................................................................................................................................e.................................................................|.....................................................................................................................................................................................................................e.. + // mul v23.4s, v24.4s, v7.s[0] // ...................................................................................................................................................................e.............................................................|........................................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v7.s[1] // ..................................................................................................................................................................e..............................................................|........................................................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ........................................................................................................................................................................e........................................................|........................................................................................................................................................................................................................ + // sub v24.4s, v8.4s, v10.4s // ..................................................................................................e..............................................................................................................................|........................................................................................................................................................e............................................................... + // add v8.4s, v8.4s, v10.4s // ...................................................................................................e.............................................................................................................................|.........................................................................................................................................................e.............................................................. 
+ // mul v10.4s, v24.4s, v1.s[2] // ...........................................................................................................................................e.....................................................................................|.................................................................................................................................................................................................e...................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................................................................................................................................................e................................................................................|......................................................................................................................................................................................................e................. + // mls v10.4s, v24.4s, v29.4s // ............................................................................................................................................................e....................................................................|..................................................................................................................................................................................................................e..... + // sub v24.4s, v9.4s, v11.4s // ......................................................................................................e..........................................................................................................................|............................................................................................................................................................e........................................................... 
+ // add v9.4s, v9.4s, v11.4s // .....................................................................................................e...........................................................................................................................|...........................................................................................................................................................e............................................................ + // mul v11.4s, v24.4s, v1.s[2] // ............................................................................................................................e....................................................................................................|..................................................................................................................................................................................e..................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................................................e...................................................................................................|...................................................................................................................................................................................e.................................... + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................................................e.........................................................................................|.............................................................................................................................................................................................e.......................... 
+ // sub v24.4s, v12.4s, v14.4s // ....................................................................e............................................................................................................................................................|..........................................................................................................................e............................................................................................. + // add v12.4s, v12.4s, v14.4s // ..................................................................e..............................................................................................................................................................|........................................................................................................................e............................................................................................... + // mul v14.4s, v24.4s, v2.s[0] // .................................................................................................................e...............................................................................................................|.......................................................................................................................................................................e................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..................................................................................................................e..............................................................................................................|........................................................................................................................................................................e............................................... 
+ // mls v14.4s, v24.4s, v29.4s // ......................................................................................................................e..........................................................................................................|............................................................................................................................................................................e........................................... + // sub v24.4s, v13.4s, v15.4s // .................................................................................e...............................................................................................................................................|.......................................................................................................................................e................................................................................ + // add v13.4s, v13.4s, v15.4s // ..................................................................................e..............................................................................................................................................|........................................................................................................................................e............................................................................... + // mul v15.4s, v24.4s, v2.s[0] // .........................................................................................................................e.......................................................................................................|...............................................................................................................................................................................e........................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................................................e......................................................................................................|................................................................................................................................................................................e....................................... + // mls v15.4s, v24.4s, v29.4s // .................................................................................................................................e...............................................................................................|.......................................................................................................................................................................................e................................ + // sub v24.4s, v16.4s, v18.4s // .....................................................................................................................................................e...........................................................................|...........................................................................................................................................................................................................e............ + // add v16.4s, v16.4s, v18.4s // ....................................................................................................................................................e............................................................................|..........................................................................................................................................................................................................e............. 
+ // mul v18.4s, v24.4s, v2.s[2] // .........................................................................................................................................................................e.......................................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................................................................................................................e......................................................|........................................................................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // ..............................................................................................................................................................................e..................................................|........................................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // .........................................................................................................................................................e.......................................................................|...............................................................................................................................................................................................................e........ 
+ // add v17.4s, v17.4s, v19.4s // ..........................................................................................................................................................e......................................................................|................................................................................................................................................................................................................e....... + // mul v19.4s, v24.4s, v2.s[2] // .................................................................................................................................................................................................................................*........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .................................................................................................................................................................................................................................|*....................................................................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|....*................................................................................................................................................................................................................... 
+ // sub v24.4s, v20.4s, v22.4s // ....................................................................................................................................................................e............................................................|........................................................................................................................................................................................................................ + // add v20.4s, v20.4s, v22.4s // .....................................................................................................................................................................e...........................................................|........................................................................................................................................................................................................................ + // mul v22.4s, v24.4s, v3.s[0] // .............................................................................................................................................................................................................e...................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ............................................................................................................................................................................................................e....................|........................................................................................................................................................................................................................ 
+ // mls v22.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................................e..............|........................................................................................................................................................................................................................ + // sub v24.4s, v21.4s, v23.4s // ................................................................................................................................................................................................................e................|........................................................................................................................................................................................................................ + // add v21.4s, v21.4s, v23.4s // ...............................................................................................................................................................................................................e.................|........................................................................................................................................................................................................................ + // mul v23.4s, v24.4s, v3.s[0] // ....................................................................................................................................................................................................................e............|........................................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................................................................................................................................................................................................e.....|........................................................................................................................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ...............................................................................................................................................................................................................................e.|........................................................................................................................................................................................................................ + // sub v24.4s, v8.4s, v12.4s // ................................................................................................................................e................................................................................................|......................................................................................................................................................................................e................................. + // add v8.4s, v8.4s, v12.4s // ...................................................................................................................................e.............................................................................................|.........................................................................................................................................................................................e.............................. 
+ // mul v12.4s, v24.4s, v0.s[2] // ...................................................................................................................................................................................e.............................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................................................................................................e................................................|........................................................................................................................................................................................................................ + // mls v12.4s, v24.4s, v29.4s // ........................................................................................................................................................................................e........................................|........................................................................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................e.......................................................................................................................|...............................................................................................................................................................e........................................................ 
+ // add v9.4s, v9.4s, v13.4s // ........................................................................................................e........................................................................................................................|..............................................................................................................................................................e......................................................... + // mul v13.4s, v24.4s, v0.s[2] // .......................................................................................................................................................................e.........................................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................................................................................................................................................................e.....................................................|........................................................................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .................................................................................................................................................................................e...............................................|........................................................................................................................................................................................................................ 
+ // sub v24.4s, v10.4s, v14.4s // .....................................................................................................................................................................................e...........................................|........................................................................................................................................................................................................................ + // add v10.4s, v10.4s, v14.4s // ....................................................................................................................................................................................e............................................|........................................................................................................................................................................................................................ + // mul v14.4s, v24.4s, v0.s[2] // .................................................................................................................................................................................................................e...............|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................................................................................e.......................|........................................................................................................................................................................................................................ 
+ // mls v14.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|.*...................................................................................................................................................................................................................... + // sub v24.4s, v11.4s, v15.4s // ...........................................................................................................................................................................................e.....................................|........................................................................................................................................................................................................................ + // add v11.4s, v11.4s, v15.4s // ................................................................................................................................................................................................e................................|........................................................................................................................................................................................................................ + // mul v15.4s, v24.4s, v0.s[2] // ....................................................................................................................................................................................................e............................|........................................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .................................................................................................................................................................................................e...............................|........................................................................................................................................................................................................................ + // mls v15.4s, v24.4s, v29.4s // ........................................................................................................................................................................................................e........................|........................................................................................................................................................................................................................ + // sub v24.4s, v16.4s, v20.4s // ............................................................................................................................................................................e....................................................|........................................................................................................................................................................................................................ + // add v16.4s, v16.4s, v20.4s // .............................................................................................................................................................................e...................................................|........................................................................................................................................................................................................................ 
+ // mul v20.4s, v24.4s, v1.s[0] // ..................................................................................................................................................................................e..............................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...............................................................................................................................................................................e.................................................|........................................................................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // .......................................................................................................................................................................................e.........................................|........................................................................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // ...................................................................................................................................................................................................................e.............|........................................................................................................................................................................................................................ 
+ // add v17.4s, v17.4s, v21.4s // .....................................................................................................................................................................................................................e...........|........................................................................................................................................................................................................................ + // mul v21.4s, v24.4s, v1.s[0] // ......................................................................................................................................................................................................................e..........|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .......................................................................................................................................................................................................................e.........|........................................................................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................e....|........................................................................................................................................................................................................................ 
+ // sub v24.4s, v18.4s, v22.4s // .................................................................................................................................................................................................................................|..........*............................................................................................................................................................................................................. + // add v18.4s, v18.4s, v22.4s // .................................................................................................................................................................................................................................|.....*.................................................................................................................................................................................................................. + // mul v22.4s, v24.4s, v1.s[0] // .................................................................................................................................................................................................................................|..................*..................................................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................................................................................................|......................*................................................................................................................................................................................................. 
+ // mls v22.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|..........................*............................................................................................................................................................................................. + // sub v24.4s, v19.4s, v23.4s // .................................................................................................................................................................................................................................|........*............................................................................................................................................................................................................... + // add v19.4s, v19.4s, v23.4s // .................................................................................................................................................................................................................................|.........*.............................................................................................................................................................................................................. + // mul v23.4s, v24.4s, v1.s[0] // .................................................................................................................................................................................................................................|...........*............................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................................................................................................|............*........................................................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|................*....................................................................................................................................................................................................... + // sub v24.4s, v8.4s, v16.4s // ......................................................................................................................................................................................e..........................................|........................................................................................................................................................................................................................ + // add v8.4s, v8.4s, v16.4s // ............................................................................................................................................................................................e....................................|........................................................................................................................................................................................................................ 
+ // mul v16.4s, v24.4s, v0.s[0] // .........................................................................................................................................................................................e.......................................|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................................................................................................................................................................e......................................|........................................................................................................................................................................................................................ + // mls v16.4s, v24.4s, v29.4s // ..............................................................................................................................................................................................e..................................|........................................................................................................................................................................................................................ + // sub v24.4s, v9.4s, v17.4s // ........................................................................................................................................................................................................................e........|........................................................................................................................................................................................................................ 
+ // add v9.4s, v9.4s, v17.4s // .........................................................................................................................................................................................................................e.......|........................................................................................................................................................................................................................ + // mul v17.4s, v24.4s, v0.s[0] // ..............................................................................................................................................................................................................................e..|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................................................................................................................................................................................e|........................................................................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|......*................................................................................................................................................................................................................. 
+ // sub v24.4s, v10.4s, v18.4s // .................................................................................................................................................................................................................................|.........................*.............................................................................................................................................................................................. + // add v10.4s, v10.4s, v18.4s // .................................................................................................................................................................................................................................|.................................*...................................................................................................................................................................................... + // mul v18.4s, v24.4s, v0.s[0] // ...................*.............................................................................................................................................................................................................|.........................................................................*.............................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................*.........................................................................................................................................................................................................|.............................................................................*.......................................................................................................................................... 
+ // mls v18.4s, v24.4s, v29.4s // ...........................*.....................................................................................................................................................................................................|.................................................................................*...................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // .................................................................................................................................................................................................................................|.............*.......................................................................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // .................................................................................................................................................................................................................................|..............*......................................................................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // .................................................................................................................................................................................................................................|.......................*................................................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................................................................................................................................................................|..............................*......................................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|...................................*.................................................................................................................................................................................... + // sub v24.4s, v12.4s, v20.4s // .............................................................................................................................................................................................e...................................|........................................................................................................................................................................................................................ + // add v12.4s, v12.4s, v20.4s // ...............................................................................................................................................................................................e.................................|........................................................................................................................................................................................................................ 
+ // mul v20.4s, v24.4s, v0.s[0] // ..........................................................................................................................................................................................................................e......|........................................................................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................................................................................................................................e...|........................................................................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // .................................................................................................................................................................................................................................|...........................................*............................................................................................................................................................................ + // sub v24.4s, v13.4s, v21.4s // .....................*...........................................................................................................................................................................................................|...........................................................................*............................................................................................................................................ 
+ // add v13.4s, v13.4s, v21.4s // ............*....................................................................................................................................................................................................................|..................................................................*..................................................................................................................................................... + // mul v21.4s, v24.4s, v0.s[0] // ......................................*..........................................................................................................................................................................................|............................................................................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................*.............................................................................................................................................................................................|.........................................................................................*.............................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ...........................................*.....................................................................................................................................................................................|.................................................................................................*...................................................................................................................... 
+ // sub v24.4s, v14.4s, v22.4s // .............*...................................................................................................................................................................................................................|...................................................................*.................................................................................................................................................... + // add v14.4s, v14.4s, v22.4s // ...........*.....................................................................................................................................................................................................................|.................................................................*...................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // ................*................................................................................................................................................................................................................|......................................................................*................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................*............................................................................................................................................................................................................|..........................................................................*............................................................................................................................................. 
+ // mls v22.4s, v24.4s, v29.4s // ...............................*.................................................................................................................................................................................................|.....................................................................................*.................................................................................................................................. + // sub v24.4s, v15.4s, v23.4s // .................................................................................................................................................................................................................................|.....................*.................................................................................................................................................................................................. + // add v15.4s, v15.4s, v23.4s // .................................................................................................................................................................................................................................|....................*................................................................................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ........................*........................................................................................................................................................................................................|..............................................................................*......................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................*...................................................................................................................................................................................|...................................................................................................*.................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................*...............................................................................................................................................................................|.......................................................................................................*................................................................................................................ + // cmge v27.4s, v31.4s, v16.4s // ..................................................................................................................................................................................................e..............................|........................................................................................................................................................................................................................ + // cmge v28.4s, v16.4s, v30.4s // ...................................................................................................................................................................................................e.............................|........................................................................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .....................................................................................................................................................................................................e...........................|........................................................................................................................................................................................................................ + // mls v16.4s, v28.4s, v29.4s // ..........................................................................................................................................................................................................e......................|........................................................................................................................................................................................................................ + // cmge v27.4s, v31.4s, v17.4s // .................................................................................................................................................................................................................................|.............................*.......................................................................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // .................................................................................................................................................................................................................................|...........................*............................................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|...............................*........................................................................................................................................................................................ + // mls v17.4s, v28.4s, v29.4s // .................................................................................................................................................................................................................................|..................................*..................................................................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ..........................................................................*......................................................................................................................................................|................................................................................................................................*....................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..............................................*..................................................................................................................................................................................|....................................................................................................*................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................*....................................................................................................................................................|..................................................................................................................................*..................................................................................... + // mls v18.4s, v28.4s, v29.4s // ...............................................................................*.................................................................................................................................................|.....................................................................................................................................*.................................................................................. + // cmge v27.4s, v31.4s, v19.4s // ..*..............................................................................................................................................................................................................................|........................................................*............................................................................................................................................................... + // cmge v28.4s, v19.4s, v30.4s // ......*..........................................................................................................................................................................................................................|............................................................*........................................................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ........*........................................................................................................................................................................................................................|..............................................................*......................................................................................................................................................... + // mls v19.4s, v28.4s, v29.4s // .............................*...................................................................................................................................................................................................|...................................................................................*.................................................................................................................................... + // cmge v27.4s, v31.4s, v20.4s // ......................................................*..........................................................................................................................................................................|............................................................................................................*........................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // .....................................................*...........................................................................................................................................................................|...........................................................................................................*............................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // ........................................................*........................................................................................................................................................................|..............................................................................................................*......................................................................................................... + // mls v20.4s, v28.4s, v29.4s // .............................................................*...................................................................................................................................................................|...................................................................................................................*.................................................................................................... + // cmge v27.4s, v31.4s, v21.4s // ................................................*................................................................................................................................................................................|......................................................................................................*................................................................................................................. + // cmge v28.4s, v21.4s, v30.4s // ..................................................*..............................................................................................................................................................................|........................................................................................................*............................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ....................................................*............................................................................................................................................................................|..........................................................................................................*............................................................................................................. + // mls v21.4s, v28.4s, v29.4s // .......................................................*.........................................................................................................................................................................|.............................................................................................................*.......................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ...........................................................................................................*.....................................................................................................................|.................................................................................................................................................................*...................................................... + // cmge v28.4s, v22.4s, v30.4s // ..............................................................................................................*..................................................................................................................|....................................................................................................................................................................*................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ................................................................................................................*................................................................................................................|......................................................................................................................................................................*................................................. + // mls v22.4s, v28.4s, v29.4s // ...................................................................................................................*.............................................................................................................|.........................................................................................................................................................................*.............................................. + // cmge v27.4s, v31.4s, v23.4s // .........................................................*.......................................................................................................................................................................|...............................................................................................................*........................................................................................................ + // cmge v28.4s, v23.4s, v30.4s // ....................................................................................*............................................................................................................................................|..........................................................................................................................................*............................................................................. 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................*.........................................................................................................................................|.............................................................................................................................................*.......................................................................... + // mls v23.4s, v28.4s, v29.4s // ..............................................................................................*..................................................................................................................................|....................................................................................................................................................*................................................................... + // str q16, [x1, #(8*(512/8))] // ..............................................................................................................................................................................................................e..................|........................................................................................................................................................................................................................ + // str q17, [x1, #(9*(512/8))] // ...*.............................................................................................................................................................................................................................|.........................................................*.............................................................................................................................................................. 
+ // str q18, [x1, #(10*(512/8))] // ................................................................................................*................................................................................................................................|......................................................................................................................................................*................................................................. + // str q19, [x1, #(11*(512/8))] // ................................*................................................................................................................................................................................................|......................................................................................*................................................................................................................................. + // str q20, [x1, #(12*(512/8))] // .................................................................*...............................................................................................................................................................|.......................................................................................................................*................................................................................................ + // str q21, [x1, #(13*(512/8))] // .......................................................................*.........................................................................................................................................................|.............................................................................................................................*.......................................................................................... 
+ // str q22, [x1, #(14*(512/8))] // ........................................................................................................................*........................................................................................................|..............................................................................................................................................................................*......................................... + // str q23, [x1, #(15*(512/8))] // ............................................................................................................*....................................................................................................................|..................................................................................................................................................................*..................................................... + // mul v16.4s, v8.4s, v25.4s // .................................................................................................................................................................................................................................|...*.................................................................................................................................................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // .................................................................................................................................................................................................................................|..*..................................................................................................................................................................................................................... 
+ // mls v16.4s, v8.4s, v29.4s // .................................................................................................................................................................................................................................|.......*................................................................................................................................................................................................................ + // mul v17.4s, v9.4s, v25.4s // .................................................................................................................................................................................................................................|.......................................*................................................................................................................................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // .................................................................................................................................................................................................................................|........................................*............................................................................................................................................................................... + // mls v17.4s, v9.4s, v29.4s // .................................................................................................................................................................................................................................|............................................*........................................................................................................................................................................... 
+ // mul v18.4s, v10.4s, v25.4s // .................................................................................................................................................................................................................................|..........................................*............................................................................................................................................................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................................................................................................................................................................................................|......................................*................................................................................................................................................................................. + // mls v18.4s, v10.4s, v29.4s // .................................................................................................................................................................................................................................|..............................................*......................................................................................................................................................................... + // mul v19.4s, v11.4s, v25.4s // .................................*...............................................................................................................................................................................................|.......................................................................................*................................................................................................................................ 
+ // sqrdmulh v11.4s, v11.4s, v26.4s // .........................*.......................................................................................................................................................................................................|...............................................................................*........................................................................................................................................ + // mls v19.4s, v11.4s, v29.4s // .....................................*...........................................................................................................................................................................................|...........................................................................................*............................................................................................................................ + // mul v20.4s, v12.4s, v25.4s // .................................................................................................................................................................................................................................|.............................................*.......................................................................................................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // .................................................................................................................................................................................................................................|................................*....................................................................................................................................................................................... 
+ // mls v20.4s, v12.4s, v29.4s // .................................................................................................................................................................................................................................|..................................................*..................................................................................................................................................................... + // mul v21.4s, v13.4s, v25.4s // ...............*.................................................................................................................................................................................................................|.....................................................................*.................................................................................................................................................. + // sqrdmulh v13.4s, v13.4s, v26.4s // ..................*..............................................................................................................................................................................................................|........................................................................*............................................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ......................*..........................................................................................................................................................................................................|............................................................................*........................................................................................................................................... 
+ // mul v22.4s, v14.4s, v25.4s // ......................................................................*..........................................................................................................................................................|............................................................................................................................*........................................................................................... + // sqrdmulh v14.4s, v14.4s, v26.4s // ........................................*........................................................................................................................................................................................|..............................................................................................*......................................................................................................................... + // mls v22.4s, v14.4s, v29.4s // ..............................................................................*..................................................................................................................................................|....................................................................................................................................*................................................................................... + // mul v23.4s, v15.4s, v25.4s // .................................................................................................................................................................................................................................|.....................................*.................................................................................................................................................................................. 
+ // sqrdmulh v15.4s, v15.4s, v26.4s // .................................................................................................................................................................................................................................|....................................*................................................................................................................................................................................... + // mls v23.4s, v15.4s, v29.4s // .................................................................................................................................................................................................................................|.........................................*.............................................................................................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // .................................................................................................................................................................................................................................|.................*...................................................................................................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // .................................................................................................................................................................................................................................|...............*........................................................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|...................*.................................................................................................................................................................................................... + // mls v16.4s, v28.4s, v29.4s // .................................................................................................................................................................................................................................|........................*............................................................................................................................................................................................... + // cmge v27.4s, v31.4s, v17.4s // .................................................................................................................................................................................................................................|.................................................*...................................................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // .................................................................................................................................................................................................................................|...................................................*.................................................................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|.....................................................*.................................................................................................................................................................. + // mls v17.4s, v28.4s, v29.4s // ..........*......................................................................................................................................................................................................................|................................................................*....................................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ............................*....................................................................................................................................................................................................|..................................................................................*..................................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ..................................*..............................................................................................................................................................................................|........................................................................................*............................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ....................................*............................................................................................................................................................................................|..........................................................................................*............................................................................................................................. + // mls v18.4s, v28.4s, v29.4s // .......................................*.........................................................................................................................................................................................|.............................................................................................*.......................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // .........................................*.......................................................................................................................................................................................|...............................................................................................*........................................................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ..........................................*......................................................................................................................................................................................|................................................................................................*....................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................*....................................................................................................................................................................................|..................................................................................................*..................................................................................................................... + // mls v19.4s, v28.4s, v29.4s // ...............................................*.................................................................................................................................................................................|.....................................................................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v20.4s // ................................................................................*................................................................................................................................................|......................................................................................................................................*................................................................................. + // cmge v28.4s, v20.4s, v30.4s // .......................................................................................................................*.........................................................................................................|.............................................................................................................................................................................*.......................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...........................................................................................................................*.....................................................................................................|.................................................................................................................................................................................*...................................... + // mls v20.4s, v28.4s, v29.4s // ..............................................................................................................................*..................................................................................................|....................................................................................................................................................................................*................................... + // cmge v27.4s, v31.4s, v21.4s // ....................................................................................................*............................................................................................................................|..........................................................................................................................................................*............................................................. + // cmge v28.4s, v21.4s, v30.4s // .......................................................................................................*.........................................................................................................................|.............................................................................................................................................................*.......................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................*......................................................................................................................|................................................................................................................................................................*....................................................... + // mls v21.4s, v28.4s, v29.4s // .............................................................................................................................................................*...................................................................|...................................................................................................................................................................................................................*.... + // cmge v27.4s, v31.4s, v22.4s // .....................................................................................................................................*...........................................................................................|...........................................................................................................................................................................................*............................ + // cmge v28.4s, v22.4s, v30.4s // ......................................................................................................................................*..........................................................................................|............................................................................................................................................................................................*........................... 
+ // sub v28.4s, v27.4s, v28.4s // ........................................................................................................................................*........................................................................................|..............................................................................................................................................................................................*......................... + // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................*....................................................................................|..................................................................................................................................................................................................*..................... + // cmge v27.4s, v31.4s, v23.4s // .................................................................................................................................................................................................................................|................................................*....................................................................................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // .................................................................................................................................................................................................................................|...............................................*........................................................................................................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .................................................................................................................................................................................................................................|....................................................*................................................................................................................................................................... + // mls v23.4s, v28.4s, v29.4s // .......*.........................................................................................................................................................................................................................|.............................................................*.......................................................................................................................................................... + // str q16, [x1], #(16) // .................................................................................................................................................................................................................................|............................*........................................................................................................................................................................................... + // str q17, [x1, #(-16 + 1*(512/8))] // ..............*..................................................................................................................................................................................................................|....................................................................*................................................................................................................................................... 
+ // str q18, [x1, #(-16 + 2*(512/8))] // ............................................................*....................................................................................................................................................................|..................................................................................................................*..................................................................................................... + // str q19, [x1, #(-16 + 3*(512/8))] // ...................................................*.............................................................................................................................................................................|.........................................................................................................*.............................................................................................................. + // str q20, [x1, #(-16 + 4*(512/8))] // ..................................................................................................................................*..............................................................................................|........................................................................................................................................................................................*............................... 
+ // str q21, [x1, #(-16 + 5*(512/8))] // .................................................................................................................................................................*...............................................................|.......................................................................................................................................................................................................................* + // str q22, [x1, #(-16 + 6*(512/8))] // ...............................................................................................................................................*.................................................................................|.....................................................................................................................................................................................................*.................. + // str q23, [x1, #(-16 + 7*(512/8))] // .................*...............................................................................................................................................................................................................|.......................................................................*................................................................................................................................................ + + sub count, count, #1 + cbnz count, layer1234_start + mls v12.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v24.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + mul v8.4S, v18.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + sqrdmulh v18.4S, v18.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mls v9.4S, v20.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mls v16.4S, v24.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ sqrdmulh v24.4S, v27.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mls v8.4S, v18.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + cmge v18.4S, v9.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mls v17.4S, v22.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v27.4S, v27.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sub v22.4S, v8.4S, v11.4S // ...................................................................................................................................*.................................................................................................................................................... 
+ add v8.4S, v8.4S, v11.4S // ....................................................................................................................................*................................................................................................................................................... + cmge v11.4S, v31.4S, v9.4S // ....................................................................................................................................................................................*................................................................................................... + mul v20.4S, v22.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v22.4S, v22.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v27.4S, v24.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + add v24.4S, v28.4S, v8.4S // ........................................................................................................................................................*............................................................................................................................... 
+ sub v11.4S, v11.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. + mls v20.4S, v22.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + add v18.4S, v14.4S, v21.4S // ...............................................................................................................................*........................................................................................................................................................ + mls v9.4S, v11.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + add v22.4S, v13.4S, v19.4S // ..................................................................................................................................................................*..................................................................................................................... + add v11.4S, v15.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... 
+ sub v15.4S, v15.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ + str q9, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mul v9.4S, v11.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sub v13.4S, v13.4S, v19.4S // .................................................................................................................................................................*...................................................................................................................... + sub v20.4S, v14.4S, v21.4S // ..............................................................................................................................*......................................................................................................................................................... + sub v21.4S, v28.4S, v8.4S // .......................................................................................................................................................*................................................................................................................................ 
+ mul v8.4S, v13.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + sqrdmulh v19.4S, v13.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sqrdmulh v14.4S, v24.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + sqrdmulh v28.4S, v15.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mul v13.4S, v15.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + mls v8.4S, v19.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ mul v19.4S, v24.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v15.4S, v11.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v13.4S, v28.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v11.4S, v8.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v28.4S, v31.4S, v8.4S // ....................................................................................................................................................................................................*................................................................................... + mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... 
+ sub v28.4S, v28.4S, v11.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v24.4S, v31.4S, v13.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v14.4S, v13.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + mls v8.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v28.4S, v24.4S, v14.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v9.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v14.4S, v31.4S, v17.4S // ................................................................................................................................................................................................*....................................................................................... + mls v13.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + cmge v24.4S, v9.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v15.4S, v31.4S, v9.4S // ............................................................................................................................................................................................................................................................................*........... + str q13, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ 
+ sqrdmulh v13.4S, v23.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v11.4S, v31.4S, v27.4S // ....................................................................................................................................................................................................................................................*................................... + str q8, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v8.4S, v27.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v24.4S, v15.4S, v24.4S // ..............................................................................................................................................................................................................................................................................*......... + sub v15.4S, v11.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. 
+ mul v11.4S, v20.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sqrdmulh v20.4S, v20.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mls v9.4S, v24.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v27.4S, v15.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sub v15.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mls v11.4S, v20.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ add v8.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + str q27, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + sqrdmulh v20.4S, v15.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + sub v24.4S, v12.4S, v11.4S // ......................................................................................................................................................................*................................................................................................................. + add v12.4S, v12.4S, v11.4S // .......................................................................................................................................................................*................................................................................................................ + mul v18.4S, v15.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
+ sqrdmulh v27.4S, v24.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v11.4S, v24.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v15.4S, v12.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mul v12.4S, v12.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v18.4S, v20.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mls v11.4S, v27.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. 
+ cmge v10.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v12.4S, v15.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + cmge v24.4S, v31.4S, v11.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v15.4S, v11.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mul v20.4S, v21.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sub v28.4S, v28.4S, v10.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v10.4S, v17.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mul v23.4S, v23.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v16.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + sub v28.4S, v14.4S, v10.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v14.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ mls v23.4S, v13.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v13.4S, v27.4S, v14.4S // ..........................................................................................................................................................................................*............................................................................................. + mls v17.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + sqrdmulh v27.4S, v22.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mls v18.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ 
+ mul v22.4S, v22.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + cmge v28.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................................................................................*............... + sqrdmulh v14.4S, v8.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mul v10.4S, v8.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mls v22.4S, v27.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v8.4S, v23.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ cmge v27.4S, v31.4S, v23.4S // ................................................................................................................................................................................................................................................................*....................... + mls v10.4S, v14.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + cmge v14.4S, v31.4S, v22.4S // ....................................................................................................................................................................................................................................................................*................... + cmge v13.4S, v22.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + sub v27.4S, v27.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... + sub v14.4S, v14.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. 
+ sqrdmulh v21.4S, v21.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + cmge v8.4S, v31.4S, v10.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v13.4S, v10.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + mls v22.4S, v14.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v13.4S, v8.4S, v13.4S // ..........................................................................................................................................................................................................................................................*............................. + mls v20.4S, v21.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
+ sub v15.4S, v24.4S, v15.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v23.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + str q22, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + cmge v24.4S, v20.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + cmge v27.4S, v31.4S, v20.4S // ............................................................................................................................................................................................*........................................................................................... + str q18, [x1, #624] // ..................................................................................................................................................................................................................*..................................................................... 
+ sub v21.4S, v27.4S, v24.4S // ..............................................................................................................................................................................................*......................................................................................... + str q23, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v11.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v20.4S, v21.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v10.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ str q9, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + cmge v9.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q20, [x1, #688] // ...................................................................................................................................................................................................................*.................................................................... + cmge v20.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q10, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v28.4S, v28.4S, v20.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + str q17, [x1, #752] // ....................................................................................................................................................................................................................*................................................................... + sub v14.4S, v9.4S, v20.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v12.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + str q11, [x1, #880] // ......................................................................................................................................................................................................................*................................................................. + mls v19.4S, v14.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ str q12, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s new file mode 100644 index 00000000..8ae4f0b4 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_a72.s @@ -0,0 +1,1794 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+// NOTE(review): the header name was missing after #include, which does not
+// preprocess. hal_env.h is assumed to be the header providing ASM_LOAD --
+// confirm against the build environment.
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// ---------------------------------------------------------------------------
+// Vector load/store wrappers.
+// \vec is a symbolic vector name; the macro accesses the register through the
+// alias qform_\vec, so a matching `qform_<name> .req qN` must exist.
+//   *_vo : access at [\base + #\offset], base register unchanged
+//   *_vi : access at [\base], then base register += #\inc (post-increment)
+// ---------------------------------------------------------------------------
+.macro ldr_vo vec, base, offset
+    ldr qform_\vec, [\base, #\offset]
+.endm
+.macro ldr_vi vec, base, inc
+    ldr qform_\vec, [\base], #\inc
+.endm
+.macro str_vo vec, base, offset
+    str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc
+    str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Three-operand vector arithmetic on 4 x 32-bit lanes (.4s).
+// d is the destination, a and b are the sources.
+// ---------------------------------------------------------------------------
+.macro vsub d,a,b
+    sub \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vadd d,a,b
+    add \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulh d,a,b
+    sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmul d,a,b
+    mul \d\().4s, \a\().4s, \b\().4s
+.endm
+// d -= a * b (lane-wise multiply-subtract)
+.macro vmls d,a,b
+    mls \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// ---------------------------------------------------------------------------
+// By-element variants: every lane of a is combined with the single 32-bit
+// element b.s[i].
+// ---------------------------------------------------------------------------
+.macro vqrdmulhq d,a,b,i
+    sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmulq d,a,b,i
+    mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i
+    mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// ---------------------------------------------------------------------------
+// Transpose helpers: trn1/trn2 on 64-bit (_d, .2d) or 32-bit (_s, .4s) lanes.
+// ---------------------------------------------------------------------------
+.macro trn1_d d,a,b
+    trn1 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn2_d d,a,b
+    trn2 \d\().2d, \a\().2d, \b\().2d
+.endm
+.macro trn1_s d,a,b
+    trn1 \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro trn2_s d,a,b
+    trn2 \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// Multiplication of src by a constant held in one lane pair of \const:
+//   dst  = src * const.s[idx0]
+//   src  = sqrdmulh(src, const.s[idx1])      (src is overwritten)
+//   dst -= src * modulus
+// Requires the alias `modulus` to name a vector register.
+.macro mulmodq dst, src, const, idx0, idx1
+    vmulq \dst, \src, \const, \idx0
+    vqrdmulhq \src, \src, \const, \idx1
+    vmls \dst, \src, modulus
+.endm
+
+// Multiplication of src by a full-vector constant:
+//   dst  = src * const
+//   src  = sqrdmulh(src, const_twisted)      (src is overwritten)
+//   dst -= src * modulus
+// Requires the alias `modulus` to name a vector register.
+.macro mulmod dst, src, const, const_twisted
+    vmul \dst, \src, \const
+    vqrdmulh \src, \src, \const_twisted
+    vmls \dst, \src, modulus
+.endm
+
+// a -= (a rounded-shifted right by 23 bits) * modulus, lane-wise.
+// Clobbers the register aliased as `tmp`.
+.macro barrett_reduce_single a
+    srshr tmp.4S, \a\().4S, #23
+    vmls \a, tmp, modulus
+.endm
+
+// Lane-wise correction of a by one multiple of modulus:
+//   tmp1 = all-ones where neg_modulus_half >= a
+//   tmp2 = all-ones where a >= modulus_half
+//   tmp2 = tmp1 - tmp2   (-1 where a is too small, +1 where a is too large)
+//   a   -= tmp2 * modulus
+// Clobbers tmp1 and tmp2.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+    cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+    cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+    sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+    vmls \a, \tmp2, modulus
+.endm
+
+// Butterfly with the root taken from lanes idx0/idx1 of \root:
+//   tmp = a - b;  a = a + b;  b = mulmodq(tmp, root[idx0], root[idx1])
+// Clobbers the register aliased as `tmp`.
+.macro gs_butterfly a, b, root, idx0, idx1
+    vsub tmp, \a, \b
+    vadd \a, \a, \b
+    mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// Same instruction sequence as mulmod; kept as a separate name.
+.macro mulmod_v dst, src, const, const_twisted
+    vmul \dst, \src, \const
+    vqrdmulh \src, \src, \const_twisted
+    vmls \dst, \src, modulus
+.endm
+
+// Butterfly with full-vector root and twisted root:
+//   tmp = a - b;  a = a + b;  b = mulmod(tmp, root, root_twisted)
+// Clobbers the register aliased as `tmp`.
+.macro gs_butterfly_v a, b, root, root_twisted
+    vsub tmp, \a, \b
+    vadd \a, \a, \b
+    mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// Applies mulmod with the constants ninv / ninv_tw to eight vectors.
+// Each srcN is overwritten by mulmod. The aliases `ninv` and `ninv_tw` must
+// be defined by the code that expands this macro.
+.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+    mulmod \dst4, \src4, ninv, ninv_tw
+    mulmod \dst5, \src5, ninv, ninv_tw
+    mulmod \dst6, \src6, ninv, ninv_tw
+    mulmod \dst7, \src7, ninv, ninv_tw
+.endm
+
+// Loads eight consecutive 16-byte vectors into root0..root7 and advances
+// \r_ptr by 8*16 bytes. The pointer is bumped by the first load, so the
+// remaining loads use negative offsets relative to the new pointer value.
+.macro load_roots_1234 r_ptr
+    ldr_vi root0, \r_ptr, (8*16)
+    ldr_vo root1, \r_ptr, (-8*16 + 1*16)
+    ldr_vo root2, \r_ptr, (-8*16 + 2*16)
+    ldr_vo root3, \r_ptr, (-8*16 + 3*16)
+    ldr_vo root4, \r_ptr, (-8*16 + 4*16)
+    ldr_vo root5, \r_ptr, (-8*16 + 5*16)
+    ldr_vo root6, \r_ptr, (-8*16 + 6*16)
+    ldr_vo root7, \r_ptr, (-8*16 + 7*16)
+.endm
+
+// Loads one 16-byte vector and advances the pointer by 16 bytes.
+.macro load_next_roots_56 root0, r_ptr0
+    ldr_vi \root0, \r_ptr0, 16
+.endm
+
+// Loads one 16-byte vector but advances the pointer by only 8 bytes.
+.macro load_next_roots_6 root0, r_ptr0
+    ldr_vi \root0, \r_ptr0, 8
+.endm
+
+// Loads six consecutive 16-byte vectors in the memory order
+// root0, root0_tw, root1, root1_tw, root2, root2_tw and advances \r_ptr1 by
+// 6*16 bytes (post-increment on the first load, negative offsets afterwards).
+.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1
+    ldr_vi \root0, \r_ptr1, (6*16)
+    ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16)
+    ldr_vo \root1, \r_ptr1, (-6*16 + 2*16)
+    ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16)
+    ldr_vo \root2, \r_ptr1, (-6*16 + 4*16)
+    ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16)
+.endm
+
+// In-place 4x4 transpose of 32-bit elements held in the four vectors
+// \data0..\data3. Clobbers the registers aliased as t0..t3.
+.macro transpose4 data
+    trn1_s t0, \data\()0, \data\()1
+    trn2_s t1, \data\()0, \data\()1
+    trn1_s t2, \data\()2, \data\()3
+    trn2_s t3, \data\()2, \data\()3
+
+    trn2_d \data\()2, t0, t2
+    trn2_d \data\()3, t1, t3
+    trn1_d \data\()0, t0, t2
+    trn1_d \data\()1, t1, t3
+.endm
+
+// Reserves 6*16 bytes on the stack and stores x19..x29 there.
+// Each register pair is stored exactly once; restore_gprs reads the same
+// slots back.
+.macro save_gprs // slothy:no-unfold
+    sub sp, sp, #(16*6)
+    stp x19, x20, [sp, #16*0]
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    str x29, [sp, #16*5]
+.endm
+
+// Reloads x19..x29 from the slots written by save_gprs and releases the
+// 6*16 bytes of stack.
+.macro restore_gprs // slothy:no-unfold
+    ldp x19, x20, [sp, #16*0]
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldr x29, [sp, #16*5]
+    add sp, sp, #(16*6)
+.endm
+
+// Reserves 4*16 bytes on the stack and stores d8..d15 (the low 64 bits of
+// v8..v15) there.
+.macro save_vregs // slothy:no-unfold
+    sub sp, sp, #(16*4)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+// Reloads d8..d15 from the slots written by save_vregs and releases the
+// 4*16 bytes of stack.
+.macro restore_vregs // slothy:no-unfold
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)
+.endm
+
+// Size in bytes of the local scratch area below the saved registers, and the
+// offset of its first slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// Load register \a from / store register \a to the scratch slot at sp + \loc.
+// Note the differing argument order of the two macros.
+.macro restore a, loc // slothy:no-unfold
+    ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // slothy:no-unfold
+    str \a, [sp, #\loc\()]
+.endm
+
+// Function prologue: save x19..x29 and d8..d15, then reserve the scratch area.
+.macro push_stack // slothy:no-unfold
+    save_gprs
+    save_vregs
+    sub sp, sp, #STACK_SIZE
+.endm
+
+// Function epilogue: undo push_stack in reverse order.
+.macro pop_stack // slothy:no-unfold
+    add sp, sp, #STACK_SIZE
+    restore_vregs
+    restore_gprs
+.endm
+
+// Twiddle-factor table, 16-byte aligned, pulled in from a separate file.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_1234_5678_twiddles.s"
+.text
+
+    .global intt_dilithium_1234_5678_opt_a72
+    .global _intt_dilithium_1234_5678_opt_a72
+
+// 64-bit constants placed in the text section next to the function.
+.p2align 4
+modulus_addr: .quad 8380417
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_1234_5678_opt_a72: +_intt_dilithium_1234_5678_opt_a72: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x0] // *................................................. + ldr q18, [x3, #48] // .*................................................ + ldr q22, [x3, #64] // ..*............................................... + ldr q26, [x3, #32] // ....*............................................. + ldr q30, [x3], #(6*16) // ...*.............................................. + // gap // .................................................. + ldr q27, [x3, #-16] // .....*............................................ 
+ ldr q15, [x3, #-80] // ........*......................................... + // gap // .................................................. + ldr q7, [x4], #8 // ...................................*.............. + // gap // .................................................. + // gap // .................................................. + add v24.4S, v10.4S, v11.4S // ......*........................................... + sub v10.4S, v10.4S, v11.4S // .......*.......................................... + // gap // .................................................. + add v8.4S, v12.4S, v13.4S // .........*........................................ + sub v13.4S, v12.4S, v13.4S // ..........*....................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v18.4S, v10.4S, v18.4S // ...........*...................................... + // gap // .................................................. + // gap // .................................................. + sub v6.4S, v24.4S, v8.4S // ............*..................................... + // gap // .................................................. + // gap // .................................................. + mul v11.4S, v13.4S, v22.4S // .................*................................ + add v24.4S, v24.4S, v8.4S // ..............*................................... + ldr q22, [x4], #16 // ....................................*............. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v27.4S, v13.4S, v27.4S // ...............*.................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v10.4S, v10.4S, v26.4S // .............*.................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v10.4S, v18.4S, v29.4S // ................*................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v11.4S, v27.4S, v29.4S // ..................*............................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v18.4S, v6.4S, v15.4S // ......................*........................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v26.4S, v6.4S, v30.4S // ...................*.............................. + // gap // .................................................. 
+ // gap // .................................................. + sub v27.4S, v10.4S, v11.4S // ....................*............................. + // gap // .................................................. + // gap // .................................................. + add v10.4S, v10.4S, v11.4S // .....................*............................ + // gap // .................................................. + // gap // .................................................. + mls v26.4S, v18.4S, v29.4S // ...........................*...................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v18.4S, v27.4S, v15.4S // .......................*.......................... + trn1 v13.4S, v24.4S, v10.4S // ........................*......................... + // gap // .................................................. + trn2 v10.4S, v24.4S, v10.4S // ..........................*....................... + // gap // .................................................. + // gap // .................................................. + mul v30.4S, v27.4S, v30.4S // .........................*........................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v30.4S, v18.4S, v29.4S // ............................*..................... 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v18.4S, v26.4S, v30.4S // .............................*.................... + trn2 v26.4S, v26.4S, v30.4S // ..............................*................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn2 v27.2D, v13.2D, v18.2D // ................................*................. + trn2 v30.2D, v10.2D, v26.2D // ...............................*.................. + // gap // .................................................. + trn1 v18.2D, v13.2D, v18.2D // .................................*................ + trn1 v10.2D, v10.2D, v26.2D // ..................................*............... + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v26.4S, v27.4S, v30.4S // ......................................*........... + sub v2.4S, v27.4S, v30.4S // ...............................................*.. + // gap // .................................................. + add v30.4S, v18.4S, v10.4S // .....................................*............ + sub v10.4S, v18.4S, v10.4S // .......................................*.......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v18.4S, v30.4S, v26.4S // ........................................*......... + sub v26.4S, v30.4S, v26.4S // .........................................*........ + // gap // .................................................. + sqrdmulh v23.4S, v10.4S, v22.S[1] // ..........................................*....... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + srshr v30.4S, v18.4S, #23 // ...........................................*...... + mul v24.4S, v26.4S, v7.S[0] // ............................................*..... + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v18.4S, v30.4S, v29.4S // .............................................*.... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v16.4S, v26.4S, v7.S[1] // ..............................................*... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v25.4S, v10.4S, v22.S[0] // ................................................*. + // gap // .................................................. + // gap // .................................................. + str q18, [x0], #(16*4) // .................................................* + // gap // .................................................. + // gap // .................................................. + + // original source code + // ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x0] // *................................................. + // ldr q20, [x3, #48] // .*................................................ + // ldr q18, [x3, #64] // ..*............................................... + // ldr q6, [x3], #(6*16) // ....*............................................. + // ldr q0, [x3, #-64] // ...*.............................................. 
+ // ldr q4, [x3, #-16] // .....*............................................ + // add v31.4S, v11.4S, v12.4S // ........*......................................... + // sub v12.4S, v11.4S, v12.4S // .........*........................................ + // ldr q5, [x3, #-80] // ......*........................................... + // add v17.4S, v13.4S, v14.4S // ..........*....................................... + // sub v11.4S, v13.4S, v14.4S // ...........*...................................... + // sqrdmulh v3.4S, v12.4S, v20.4S // ............*..................................... + // sub v30.4S, v31.4S, v17.4S // .............*.................................... + // mul v14.4S, v12.4S, v0.4S // ..................*............................... + // add v28.4S, v31.4S, v17.4S // ...............*.................................. + // sqrdmulh v19.4S, v11.4S, v4.4S // .................*................................ + // mls v14.4S, v3.4S, v29.4S // ...................*.............................. + // mul v13.4S, v11.4S, v18.4S // ..............*................................... + // mls v13.4S, v19.4S, v29.4S // ....................*............................. + // mul v27.4S, v30.4S, v6.4S // ......................*........................... + // sub v3.4S, v14.4S, v13.4S // .......................*.......................... + // add v2.4S, v14.4S, v13.4S // ........................*......................... + // sqrdmulh v0.4S, v30.4S, v5.4S // .....................*............................ + // sqrdmulh v10.4S, v3.4S, v5.4S // ..........................*....................... + // trn1 v30.4S, v28.4S, v2.4S // ...........................*...................... + // mul v18.4S, v3.4S, v6.4S // .............................*.................... + // trn2 v2.4S, v28.4S, v2.4S // ............................*..................... + // mls v27.4S, v0.4S, v29.4S // .........................*........................ 
+ // mls v18.4S, v10.4S, v29.4S // ..............................*................... + // trn1 v31.4S, v27.4S, v18.4S // ...............................*.................. + // trn2 v3.4S, v27.4S, v18.4S // ................................*................. + // trn2 v21.2D, v2.2D, v3.2D // ..................................*............... + // trn2 v0.2D, v30.2D, v31.2D // .................................*................ + // trn1 v27.2D, v30.2D, v31.2D // ...................................*.............. + // trn1 v19.2D, v2.2D, v3.2D // ....................................*............. + // ldr q7, [x4], #8 // .......*.......................................... + // ldr q22, [x4], #16 // ................*................................. + // add v2.4S, v27.4S, v19.4S // .......................................*.......... + // add v4.4S, v0.4S, v21.4S // .....................................*............ + // sub v3.4S, v27.4S, v19.4S // ........................................*......... + // add v18.4S, v2.4S, v4.4S // .........................................*........ + // sub v2.4S, v2.4S, v4.4S // ..........................................*....... + // sqrdmulh v23.4S, v3.4S, v22.S[1] // ...........................................*...... + // srshr v10.4S, v18.4S, #23 // ............................................*..... + // mul v24.4S, v2.4S, v7.S[0] // .............................................*.... + // mls v18.4S, v10.4S, v29.4S // ..............................................*... + // sqrdmulh v16.4S, v2.4S, v7.S[1] // ...............................................*.. + // sub v2.4S, v0.4S, v21.4S // ......................................*........... + // mul v25.4S, v3.4S, v22.S[0] // ................................................*. 
+ // str q18, [x0], #(16*4) // .................................................* + + sub count, count, #1 // extra decrement of count ahead of the label; the per-trip decrement sits just before the cbnz +layer5678_start: // Loop kernel. Per trip: one ld4 of four vectors from [x0] (no writeback), six q loads through x3 (post-incremented by 6*16) and two through x4 (post-incremented by 8, then 16). Instructions tagged '*' in the trailing diagram read v2, v7, v16, v22, v23, v24 and v25 as left by the previous trip (or by the code ahead of this label); the stores at [x0, #-48], [x0, #-32] and [x0, #-16] fill the 64-byte block that the previous trip's post-indexed str q18 stepped past. v29 is read by every mls and never written in the loop (presumably the modulus; confirm against the setup code). NOTE(review): v8 and v10-v14 are overwritten here; AAPCS64 requires d8-d15 to be preserved, so confirm the save/restore outside this chunk. + ld4 {v11.4S, v12.4S, v13.4S, v14.4S}, [x0] // e................................................................ + mul v10.4S, v2.4S, v22.S[2] // ............................................*.................... + ldr q20, [x3, #48] // ....e............................................................ + ldr q18, [x3, #64] // .....e........................................................... + ldr q6, [x3], #(6*16) // .e............................................................... + // gap // ................................................................. + mls v25.4S, v23.4S, v29.4S // .........................................*....................... + ldr q0, [x3, #-64] // ...e............................................................. + // gap // ................................................................. + ldr q4, [x3, #-16] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + add v31.4S, v11.4S, v12.4S // ........e........................................................ + sub v12.4S, v11.4S, v12.4S // .......e......................................................... + ldr q5, [x3, #-80] // ..e.............................................................. + sqrdmulh v21.4S, v2.4S, v22.S[3] // .............................................*................... + add v17.4S, v13.4S, v14.4S // .............e................................................... + // gap // ................................................................. + sub v11.4S, v13.4S, v14.4S // ............e.................................................... + // gap // .................................................................
+ // gap // ................................................................. + sqrdmulh v3.4S, v12.4S, v20.4S // ..........e...................................................... + // gap // ................................................................. + // gap // ................................................................. + sub v30.4S, v31.4S, v17.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. + mul v14.4S, v12.4S, v0.4S // .........e....................................................... + add v28.4S, v31.4S, v17.4S // ..................e.............................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v19.4S, v11.4S, v4.4S // ...............e................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v3.4S, v29.4S // ...........e..................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // .................................................................
+ // gap // ................................................................. + mul v13.4S, v11.4S, v18.4S // ..............e.................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v13.4S, v19.4S, v29.4S // ................e................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v10.4S, v21.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v27.4S, v30.4S, v6.4S // ...................e............................................. + // gap // ................................................................. + // gap // ................................................................. + sub v3.4S, v14.4S, v13.4S // ......................e.......................................... + // gap // .................................................................
+ // gap // ................................................................. + add v2.4S, v14.4S, v13.4S // .......................e......................................... + sqrdmulh v0.4S, v30.4S, v5.4S // ....................e............................................ + // gap // ................................................................. + sub v8.4S, v25.4S, v10.4S // ....................................................*............ + // gap // ................................................................. + // gap // ................................................................. + add v13.4S, v25.4S, v10.4S // .....................................................*........... + sqrdmulh v10.4S, v3.4S, v5.4S // .........................e....................................... + // gap // ................................................................. + trn1 v30.4S, v28.4S, v2.4S // ...........................e..................................... + // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v3.4S, v6.4S // ........................e........................................ + trn2 v2.4S, v28.4S, v2.4S // ............................e.................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v27.4S, v0.4S, v29.4S // .....................e........................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // .................................................................
+ // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v10.4S, v29.4S // ..........................e...................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v14.4S, v8.4S, v7.S[1] // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn1 v31.4S, v27.4S, v18.4S // .............................e................................... + trn2 v3.4S, v27.4S, v18.4S // ..............................e.................................. + // gap // ................................................................. + mls v24.4S, v16.4S, v29.4S // ...................................................*............. + // gap // ................................................................. + // gap // ................................................................. + // gap // .................................................................
+ // gap // ................................................................. + srshr v25.4S, v13.4S, #23 // ...........................................................*..... + trn2 v21.2D, v2.2D, v3.2D // ................................e................................ + trn2 v0.2D, v30.2D, v31.2D // ...............................e................................. + // gap // ................................................................. + trn1 v27.2D, v30.2D, v31.2D // .................................e............................... + trn1 v19.2D, v2.2D, v3.2D // ..................................e.............................. + // gap // ................................................................. + mul v26.4S, v8.4S, v7.S[0] // ......................................................*.......... + ldr q7, [x4], #8 // ...................................e............................. + ldr q22, [x4], #16 // ....................................e............................ + str q24, [x0, #-32] // ...............................................................*. + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v27.4S, v19.4S // ......................................e.......................... + add v4.4S, v0.4S, v21.4S // ...........................................e..................... + // gap // ................................................................. + sub v3.4S, v27.4S, v19.4S // .....................................e........................... + mls v26.4S, v14.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // .................................................................
+ // gap // ................................................................. + mls v13.4S, v25.4S, v29.4S // ............................................................*.... + add v18.4S, v2.4S, v4.4S // ................................................e................ + // gap // ................................................................. + sub v2.4S, v2.4S, v4.4S // ...............................................e................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v23.4S, v3.4S, v22.S[1] // ........................................e........................ + // gap // ................................................................. + // gap // ................................................................. + str q26, [x0, #-16] // ................................................................* + srshr v10.4S, v18.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v24.4S, v2.4S, v7.S[0] // .................................................e............... + str q13, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v10.4S, v29.4S // ..........................................................e...... + // gap // ................................................................. + // gap // ................................................................. + // gap // .................................................................
+ // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v16.4S, v2.4S, v7.S[1] // ..................................................e.............. + sub v2.4S, v0.4S, v21.4S // ..........................................e...................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v25.4S, v3.4S, v22.S[0] // .......................................e......................... + // gap // ................................................................. + // gap // ................................................................. + str q18, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e................................................................e........................................................... + // ldr q0, [x3], #(6*16) // ....e............................................................|...e....................................................... + // ldr q4, [x3, #(-6*16 + 1*16)] // ..........e......................................................|.........e................................................. + // ldr q1, [x3, #(-6*16 + 2*16)] // ......e..........................................................|.....e.....................................................
+ // ldr q5, [x3, #(-6*16 + 3*16)] // ..e..............................................................|.e......................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // ...e.............................................................|..e........................................................ + // ldr q6, [x3, #(-6*16 + 5*16)] // .......e.........................................................|......e.................................................... + // sub v24.4s, v8.4s, v9.4s // .........e.......................................................|........e.................................................. + // add v8.4s, v8.4s, v9.4s // ........e........................................................|.......e................................................... + // mul v9.4s, v24.4s, v1.4s // ................e................................................|...............e........................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ..............e..................................................|.............e............................................. + // mls v9.4s, v24.4s, v29.4s // ...................e.............................................|..................e........................................ + // sub v24.4s, v10.4s, v11.4s // .............e...................................................|............e.............................................. + // add v10.4s, v10.4s, v11.4s // ............e....................................................|...........e............................................... + // mul v11.4s, v24.4s, v2.4s // ....................e............................................|...................e....................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..................e..............................................|.................e.........................................
+ // mls v11.4s, v24.4s, v29.4s // .....................e...........................................|....................e...................................... + // sub v24.4s, v8.4s, v10.4s // ...............e.................................................|..............e............................................ + // add v8.4s, v8.4s, v10.4s // .................e...............................................|................e.......................................... + // mul v10.4s, v24.4s, v0.4s // .......................e.........................................|......................e.................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................e......................................|.........................e................................. + // mls v10.4s, v24.4s, v29.4s // .................................e...............................|................................e.......................... + // sub v24.4s, v9.4s, v11.4s // ........................e........................................|.......................e................................... + // add v9.4s, v9.4s, v11.4s // .........................e.......................................|........................e.................................. + // mul v11.4s, v24.4s, v0.4s // ...............................e.................................|..............................e............................ + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................e...................................|............................e.............................. + // mls v11.4s, v24.4s, v29.4s // ..................................e..............................|.................................e......................... + // trn1 v25.4s, v8.4s, v9.4s // ..............................e..................................|.............................e.............................
+ // trn2 v26.4s, v8.4s, v9.4s // ................................e................................|...............................e........................... + // trn1 v27.4s, v10.4s, v11.4s // ....................................e............................|...................................e....................... + // trn2 v28.4s, v10.4s, v11.4s // .....................................e...........................|....................................e...................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................e.......................|........................................e.................. + // trn2 v11.2d, v26.2d, v28.2d // ........................................e........................|.......................................e................... + // trn1 v8.2d, v25.2d, v27.2d // ..........................................e......................|.........................................e................. + // trn1 v9.2d, v26.2d, v28.2d // ...........................................e.....................|..........................................e................ + // ldr q1, [x4], #8 // .............................................e...................|............................................e.............. + // ldr q0, [x4], #16 // ..............................................e..................|.............................................e............. + // sub v24.4s, v8.4s, v9.4s // ..................................................e..............|.................................................e......... + // add v8.4s, v8.4s, v9.4s // ................................................e................|...............................................e........... + // mul v9.4s, v24.4s, v0.s[0] // ...............................................................e.|...........................................................
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................e.........|......................................................e.... + // mls v9.4s, v24.4s, v29.4s // .....*...........................................................|....*...................................................... + // sub v24.4s, v10.4s, v11.4s // ..............................................................e..|........................................................... + // add v10.4s, v10.4s, v11.4s // .................................................e...............|................................................e.......... + // mul v11.4s, v24.4s, v0.s[2] // .*...............................................................|*.......................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........*.....................................................|..........*................................................ + // mls v11.4s, v24.4s, v29.4s // ......................*..........................................|.....................*..................................... + // sub v24.4s, v8.4s, v10.4s // ......................................................e..........|.....................................................e..... + // add v8.4s, v8.4s, v10.4s // .....................................................e...........|....................................................e...... + // mul v10.4s, v24.4s, v1.s[0] // ..........................................................e......|.........................................................e. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................e...|........................................................... + // mls v10.4s, v24.4s, v29.4s // ......................................*..........................|.....................................*.....................
+ // sub v24.4s, v9.4s, v11.4s // ...........................*.....................................|..........................*................................ + // add v9.4s, v9.4s, v11.4s // ............................*....................................|...........................*............................... + // mul v11.4s, v24.4s, v1.s[0] // ............................................*....................|...........................................*............... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................*.............................|..................................*........................ + // mls v11.4s, v24.4s, v29.4s // ...................................................*.............|..................................................*........ + // srshr v24.4S, v8.4S, #23 // .........................................................e.......|........................................................e.. + // mls v8.4s, v24.4s, v29.4s // ............................................................e....|........................................................... + // srshr v24.4S, v9.4S, #23 // .......................................*.........................|......................................*.................... + // mls v9.4s, v24.4s, v29.4s // ....................................................*............|...................................................*....... + // str q8, [x0], #(16*4) // ................................................................e|........................................................... + // str q9, [x0, #(-16*4 + 1*16)] // ...........................................................*.....|..........................................................* + // str q10, [x0, #(-16*4 + 2*16)] // ...............................................*.................|..............................................*............
+ // str q11, [x0, #(-16*4 + 3*16)] // ........................................................*........|.......................................................*... + + sub count, count, #1 // per-trip decrement of count + cbnz count, layer5678_start // repeat while count is non-zero; the 15 instructions after this branch mirror the '*'-tagged instructions of the kernel (with other register names) and finish the final trip, ending with the stores at [x0, #-32], [x0, #-48] and [x0, #-16] + sqrdmulh v20.4S, v2.4S, v22.S[3] // ..*............ + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mul v4.4S, v2.4S, v22.S[2] // *.............. + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v25.4S, v23.4S, v29.4S // .*............. + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v4.4S, v20.4S, v29.4S // ...*........... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v24.4S, v16.4S, v29.4S // .......*....... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + sub v30.4S, v25.4S, v4.4S // ....*.......... + add v10.4S, v25.4S, v4.4S // .....*......... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + str q24, [x0, #-32] // ..........*.... + // gap // ............... + // gap // ............... + // gap // ............... + sqrdmulh v28.4S, v30.4S, v7.S[1] // ......*........ + srshr v4.4S, v10.4S, #23 // ........*...... + // gap // ............... + // gap // ............... + // gap // ............... + mul v23.4S, v30.4S, v7.S[0] // .........*..... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ...............
+ mls v10.4S, v4.4S, v29.4S // ............*.. + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + mls v23.4S, v28.4S, v29.4S // ...........*... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + str q10, [x0, #-48] // ..............* + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + // gap // ............... + str q23, [x0, #-16] // .............*. + // gap // ............... + // gap // ............... + + // original source code + // mul v10.4S, v2.4S, v22.S[2] // .*............. + // mls v25.4S, v23.4S, v29.4S // ..*............ + // sqrdmulh v21.4S, v2.4S, v22.S[3] // *.............. + // mls v10.4S, v21.4S, v29.4S // ...*........... + // sub v8.4S, v25.4S, v10.4S // .....*......... + // add v13.4S, v25.4S, v10.4S // ......*........ + // sqrdmulh v14.4S, v8.4S, v7.S[1] // ........*...... + // mls v24.4S, v16.4S, v29.4S // ....*.......... + // srshr v25.4S, v13.4S, #23 // .........*..... + // mul v26.4S, v8.4S, v7.S[0] // ..........*.... + // str q24, [x0, #-32] // .......*....... + // mls v26.4S, v14.4S, v29.4S // ............*.. + // mls v13.4S, v25.4S, v29.4S // ...........*... + // str q26, [x0, #-16] // ..............* + // str q13, [x0, #-48] // .............*. 
+ // Stage transition: release the register aliases listed below, bind new ones, and set up counter/constants before the next scheduled section.
+ .unreq root0_tw // the .unreq lines remove these alias names so the underlying registers can be re-bound
+ .unreq root1_tw
+ .unreq root2_tw
+ .unreq root3_tw
+ .unreq qform_root0_tw
+ .unreq qform_root1_tw
+ .unreq qform_root2_tw
+ .unreq qform_root3_tw
+ .unreq t0
+ .unreq t1
+ // New aliases bound from here on.
+ root4 .req v4 // root4..root7: vector (vN) names for v4..v7
+ root5 .req v5
+ root6 .req v6
+ root7 .req v7
+ qform_root4 .req q4 // qform_root4..7: 128-bit (qN) names for the same registers v4..v7
+ qform_root5 .req q5
+ qform_root6 .req q6
+ qform_root7 .req q7
+ ninv .req v25 // filled by the ld1r from ninv_addr below
+ ninv_tw .req v26 // filled by the ld1r from ninv_tw_addr below
+ modulus_half .req v30 // set to modulus >> 1 (per lane) below
+ neg_modulus_half .req v31 // set to -(modulus >> 1) (per lane) below
+
+
+ restore in, STACK0 // `restore` is defined outside this view; presumably reloads `in` from stack slot STACK0 - TODO confirm
+ mov count, #4 // counter for the loop that follows (its label and decrement are outside this view)
+
+ ASM_LOAD(xtmp, ninv_addr) // ASM_LOAD is defined outside this view; presumably puts the address of ninv_addr in xtmp - TODO confirm
+ ld1r {ninv.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes
+ ASM_LOAD(xtmp, ninv_tw_addr) // same pattern for ninv_tw_addr
+ ld1r {ninv_tw.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes
+
+ ushr modulus_half.4S, modulus.4S, #1 // per-lane logical shift right by 1 of `modulus` (alias bound outside this view)
+ neg neg_modulus_half.4S, modulus_half.4S // per-lane negation of modulus_half
+
+ load_roots_1234 r_ptr1 // macro defined outside this view; presumably loads the root registers from r_ptr1 - TODO confirm
+
+ .p2align 2 // align the following code to a 2^2 = 4-byte boundary
+ ldr q10, [x1, #448] // .......*................................................................................................................................................................................................................................................................................
+ ldr q18, [x1, #384] // ......*.................................................................................................................................................................................................................................................................................
+ ldr q22, [x1, #512] // ........*...............................................................................................................................................................................................................................................................................
+ ldr q27, [x1, #192] // ...*....................................................................................................................................................................................................................................................................................
+ ldr q13, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q15, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q24, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q8, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + sub v11.4S, v18.4S, v10.4S // ...............................*........................................................................................................................................................................................................................................................ + add v10.4S, v18.4S, v10.4S // ................................*....................................................................................................................................................................................................................................................... 
+ ldr q18, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + ldr q23, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + add v16.4S, v22.4S, v15.4S // .....................................*.................................................................................................................................................................................................................................................. + sub v9.4S, v22.4S, v15.4S // ....................................*................................................................................................................................................................................................................................................... + ldr q20, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + sub v12.4S, v13.4S, v27.4S // .....................*.................................................................................................................................................................................................................................................................. 
+ ldr q22, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ + sqrdmulh v17.4S, v11.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + add v27.4S, v13.4S, v27.4S // ......................*................................................................................................................................................................................................................................................................. + mul v15.4S, v11.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + add v13.4S, v8.4S, v24.4S // .................*...................................................................................................................................................................................................................................................................... + sub v24.4S, v8.4S, v24.4S // ................*....................................................................................................................................................................................................................................................................... 
+ add v19.4S, v23.4S, v20.4S // ...........................*............................................................................................................................................................................................................................................................ + sqrdmulh v21.4S, v12.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + sub v11.4S, v13.4S, v27.4S // ........................................................*............................................................................................................................................................................................................................... + mul v28.4S, v12.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + add v8.4S, v13.4S, v27.4S // .........................................................*.............................................................................................................................................................................................................................. + add v12.4S, v19.4S, v10.4S // ...................................................................*.................................................................................................................................................................................................................... 
+ mls v28.4S, v21.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + sub v21.4S, v8.4S, v12.4S // ................................................................................................*....................................................................................................................................................................................... + add v8.4S, v8.4S, v12.4S // .................................................................................................*...................................................................................................................................................................................... + sub v10.4S, v19.4S, v10.4S // ..................................................................*..................................................................................................................................................................................................................... + mul v13.4S, v11.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + sub v14.4S, v18.4S, v22.4S // .........................................*.............................................................................................................................................................................................................................................. 
+ add v27.4S, v18.4S, v22.4S // ..........................................*............................................................................................................................................................................................................................................. + sqrdmulh v22.4S, v11.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + sub v23.4S, v23.4S, v20.4S // ..........................*............................................................................................................................................................................................................................................................. + mls v15.4S, v17.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v17.4S, v24.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + sqrdmulh v18.4S, v9.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ 
+ mul v20.4S, v24.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + mls v20.4S, v17.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v17.4S, v23.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sub v19.4S, v20.4S, v28.4S // .............................................................*.......................................................................................................................................................................................................................... + add v12.4S, v20.4S, v28.4S // ..............................................................*......................................................................................................................................................................................................................... 
+ mul v28.4S, v10.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + sqrdmulh v10.4S, v23.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + mls v28.4S, v24.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + sqrdmulh v24.4S, v19.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + mls v17.4S, v10.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + ldr q10, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ 
+ mul v23.4S, v19.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + mls v23.4S, v24.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + ldr q24, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + add v19.4S, v17.4S, v15.4S // ........................................................................*............................................................................................................................................................................................................... + mls v13.4S, v22.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sub v15.4S, v17.4S, v15.4S // .......................................................................*................................................................................................................................................................................................................ 
+ ldr q22, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... + add v20.4S, v24.4S, v10.4S // ....................................................*................................................................................................................................................................................................................................... + sub v10.4S, v24.4S, v10.4S // ...................................................*.................................................................................................................................................................................................................................... + mul v17.4S, v9.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + sub v9.4S, v16.4S, v27.4S // ............................................................................*........................................................................................................................................................................................................... + add v11.4S, v16.4S, v27.4S // .............................................................................*.......................................................................................................................................................................................................... 
+ ldr q27, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v15.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mul v16.4S, v15.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + mls v17.4S, v18.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v18.4S, v27.4S, v22.4S // ..............................................*......................................................................................................................................................................................................................................... + add v27.4S, v27.4S, v22.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ mls v16.4S, v24.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v15.4S, v10.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v10.4S, v10.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + sqrdmulh v24.4S, v18.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v9.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mls v15.4S, v10.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ 
+ mul v10.4S, v18.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + mls v10.4S, v24.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v24.4S, v12.4S, v19.4S // .....................................................................................................*.................................................................................................................................................................................. + mul v18.4S, v9.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + add v9.4S, v12.4S, v19.4S // ......................................................................................................*................................................................................................................................................................................. + sub v12.4S, v27.4S, v20.4S // ......................................................................................*................................................................................................................................................................................................. 
+ mls v18.4S, v22.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sub v22.4S, v10.4S, v15.4S // ...........................................................................................*............................................................................................................................................................................................ + add v27.4S, v27.4S, v20.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v19.4S, v12.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + add v15.4S, v10.4S, v15.4S // ............................................................................................*........................................................................................................................................................................................... + mul v10.4S, v12.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... 
+ add v20.4S, v11.4S, v27.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v27.4S, v11.4S, v27.4S // ....................................................................................................................*................................................................................................................................................................... + sqrdmulh v12.4S, v24.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + add v11.4S, v23.4S, v16.4S // ................................................................................................................*....................................................................................................................................................................... + mul v24.4S, v24.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sub v16.4S, v23.4S, v16.4S // ...............................................................................................................*........................................................................................................................................................................ 
+ mls v10.4S, v19.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sub v19.4S, v8.4S, v20.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v20.4S // .........................................................................................................................................*.............................................................................................................................................. + sub v20.4S, v13.4S, v28.4S // ..........................................................................................................*............................................................................................................................................................................. + sqrdmulh v23.4S, v14.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + add v13.4S, v13.4S, v28.4S // ...........................................................................................................*............................................................................................................................................................................ 
+ mul v14.4S, v14.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sub v28.4S, v18.4S, v10.4S // ..............................................................................................................................*......................................................................................................................................................... + add v18.4S, v18.4S, v10.4S // ...............................................................................................................................*........................................................................................................................................................ + sqrdmulh v10.4S, v22.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mls v14.4S, v23.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + mul v23.4S, v22.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... 
+ mls v23.4S, v10.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + add v22.4S, v17.4S, v14.4S // ..................................................................................*..................................................................................................................................................................................................... + sub v14.4S, v17.4S, v14.4S // .................................................................................*...................................................................................................................................................................................................... + mls v24.4S, v12.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + add v12.4S, v22.4S, v15.4S // ..........................................................................................................................*............................................................................................................................................................. + sqrdmulh v17.4S, v16.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... 
+ sub v15.4S, v22.4S, v15.4S // .........................................................................................................................*.............................................................................................................................................................. + sqrdmulh v22.4S, v27.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + sub v10.4S, v9.4S, v12.4S // .............................................................................................................................................*.......................................................................................................................................... + mul v27.4S, v27.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + add v9.4S, v9.4S, v12.4S // ..............................................................................................................................................*......................................................................................................................................... + sqrdmulh v12.4S, v10.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... 
+ mul v16.4S, v16.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mul v17.4S, v10.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v10.4S, v19.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v19.4S, v19.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + mls v17.4S, v12.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... 
+ mul v12.4S, v21.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mls v27.4S, v22.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + mls v19.4S, v10.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + mls v12.4S, v21.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + sqrdmulh v10.4S, v14.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... 
+ mul v22.4S, v28.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + sub v21.4S, v12.4S, v27.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v27.4S // .............................................................................................................................................................*.......................................................................................................................... + mul v27.4S, v14.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + mls v27.4S, v10.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v10.4S, v13.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... 
+ sub v18.4S, v13.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + mul v13.4S, v15.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sqrdmulh v15.4S, v15.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + sub v14.4S, v27.4S, v23.4S // ...................................................................................................................................*.................................................................................................................................................... + sub count, count, #1 +layer1234_start: + sqrdmulh v28.4S, v28.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + add v23.4S, v27.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... 
+ cmge v27.4S, v31.4S, v19.4S // ................................................................................................................................................................................*....................................................................................................... + mls v13.4S, v15.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v15.4S, v11.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + sqrdmulh v23.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + mls v22.4S, v28.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ cmge v28.4S, v19.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mul v20.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sub v28.4S, v27.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v27.4S, v14.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mls v20.4S, v23.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + mul v23.4S, v14.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. 
+ mls v23.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v28.4S, v24.4S, v13.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v24.4S, v13.4S // ..................................................................................................................................................................*..................................................................................................................... + sqrdmulh v27.4S, v15.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + sub v24.4S, v20.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. 
+ add v14.4S, v20.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + mul v22.4S, v15.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + add v15.4S, v16.4S, v23.4S // ............................................................................................................................................................................*........................................................................................................... + str q19, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sub v19.4S, v16.4S, v23.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v23.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... 
+ sqrdmulh v16.4S, v18.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v20.4S, v18.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mls v22.4S, v27.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v27.4S, v19.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v18.4S, v23.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. 
+ sqrdmulh v19.4S, v19.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mls v17.4S, v18.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + sqrdmulh v18.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mul v21.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mls v20.4S, v16.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. 
+ cmge v16.4S, v22.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + sqrdmulh v23.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v9.4S, v31.4S, v22.4S // ............................................................................................................................................................................................*........................................................................................... + mls v21.4S, v18.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v16.4S, v9.4S, v16.4S // ..............................................................................................................................................................................................*......................................................................................... 
+ sqrdmulh v18.4S, v24.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v24.4S, v24.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mls v27.4S, v19.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v19.4S, v31.4S, v20.4S // ........................................................................................................................................................................................*............................................................................................... + mls v24.4S, v18.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v18.4S, v20.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ sqrdmulh v9.4S, v28.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + sub v19.4S, v19.4S, v18.4S // ..........................................................................................................................................................................................*............................................................................................. + sqrdmulh v18.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v20.4S, v19.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + mls v22.4S, v16.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ str q20, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + mul v11.4S, v28.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + str q22, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + cmge v16.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... + ldr q22, [x1, #464] // .......e................................................................................................................................................................................................................................................................................ + mls v11.4S, v9.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ cmge v9.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v20.4S, v31.4S, v27.4S // ............................................................................................................................................................................................................*........................................................................... + mls v19.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + cmge v18.4S, v27.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + sub v9.4S, v16.4S, v9.4S // ..........................................................................................................................................................................................................*............................................................................. + mls v17.4S, v23.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ cmge v23.4S, v21.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + sub v18.4S, v20.4S, v18.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v24.4S, v9.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + cmge v9.4S, v11.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v16.4S, v31.4S, v11.4S // ....................................................................................................................................................................................................*................................................................................... + mls v27.4S, v18.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ cmge v18.4S, v31.4S, v21.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v20.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + sub v9.4S, v16.4S, v9.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v18.4S, v18.4S, v23.4S // ..................................................................................................................................................................................................*..................................................................................... + str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v8.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q27, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + cmge v24.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + mls v21.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + sub v18.4S, v24.4S, v20.4S // ......................................................................................................................................................................................................................................................*................................. 
+ ldr q24, [x1, #400] // ......e................................................................................................................................................................................................................................................................................. + cmge v27.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v11.4S, v9.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v23.4S, v8.4S, v27.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + ldr q8, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... 
+ str q21, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v27.4S, v24.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ + sqrdmulh v9.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + str q11, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mul v13.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + cmge v21.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v17.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + sqrdmulh v18.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + str q19, [x1, #192] // ...................................................................................................................................................................................................................................................................................*.... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... 
+ str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v17.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v15.4S, v11.4S, v21.4S // ..................................................................................................................................................................................................................................................*..................................... + mul v12.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + sqrdmulh v11.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ add v10.4S, v24.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... + mls v13.4S, v9.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + mls v20.4S, v18.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + cmge v22.4S, v31.4S, v13.4S // ....................................................................................................................................................................................................................................................................*................... + ldr q19, [x1, #336] // .....e.................................................................................................................................................................................................................................................................................. + cmge v18.4S, v13.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. 
+ mls v12.4S, v11.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sqrdmulh v21.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + sub v28.4S, v22.4S, v18.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v9.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + cmge v24.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... 
+ cmge v11.4S, v31.4S, v12.4S // ........................................................................................................................................................................................................................................................*............................... + mls v23.4S, v17.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + cmge v18.4S, v12.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + sub v24.4S, v24.4S, v9.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v22.4S, v21.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + sub v18.4S, v11.4S, v18.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ ldr q17, [x1, #592] // .........e.............................................................................................................................................................................................................................................................................. + mls v20.4S, v24.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + add v24.4S, v8.4S, v19.4S // ...........................e............................................................................................................................................................................................................................................................ + cmge v9.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v12.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + cmge v21.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v11.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v13.4S, v28.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + cmge v18.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q20, [x1, #256] // ....................................................................................................................................................................................................................................................................................*... + sub v28.4S, v21.4S, v9.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v16.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... 
+ sub v20.4S, v8.4S, v19.4S // ..........................e............................................................................................................................................................................................................................................................. + ldr q21, [x1, #80] // .e...................................................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v27.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + ldr q14, [x1, #720] // ...........e............................................................................................................................................................................................................................................................................ + ldr q9, [x1, #16] // e....................................................................................................................................................................................................................................................................................... + sub v11.4S, v11.4S, v18.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + ldr q15, [x1, #208] // ...e.................................................................................................................................................................................................................................................................................... + ldr q28, [x1, #144] // ..e..................................................................................................................................................................................................................................................................................... + str q12, [x1, #128] // ..................................................................................................................................................................................................................................................................................*..... + str q13, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + add v13.4S, v24.4S, v10.4S // ...................................................................e.................................................................................................................................................................................................................... 
+ sub v12.4S, v24.4S, v10.4S // ..................................................................e..................................................................................................................................................................................................................... + mul v19.4S, v20.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + ldr q24, [x1, #528] // ........e............................................................................................................................................................................................................................................................................... + add v10.4S, v9.4S, v21.4S // .................e...................................................................................................................................................................................................................................................................... + add v18.4S, v28.4S, v15.4S // ......................e................................................................................................................................................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... 
+ sub v28.4S, v28.4S, v15.4S // .....................e.................................................................................................................................................................................................................................................................. + str q23, [x1, #448] // .......................................................................................................................................................................................................................................................................................* + sub v21.4S, v9.4S, v21.4S // ................e....................................................................................................................................................................................................................................................................... + mls v22.4S, v11.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + add v9.4S, v10.4S, v18.4S // .........................................................e.............................................................................................................................................................................................................................. + sqrdmulh v11.4S, v28.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + ldr q16, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + sqrdmulh v15.4S, v21.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... + mul v23.4S, v28.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + mul v22.4S, v21.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... 
+ add v21.4S, v24.4S, v17.4S // .....................................e.................................................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v17.4S // ....................................e................................................................................................................................................................................................................................................... + ldr q17, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + ldr q28, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + mls v22.4S, v15.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v15.4S, v10.4S, v18.4S // ........................................................e............................................................................................................................................................................................................................... 
+ add v10.4S, v16.4S, v14.4S // ..........................................e............................................................................................................................................................................................................................................. + sub v16.4S, v16.4S, v14.4S // .........................................e.............................................................................................................................................................................................................................................. + mul v14.4S, v27.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sub v27.4S, v9.4S, v13.4S // ................................................................................................e....................................................................................................................................................................................... + mls v23.4S, v11.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + add v18.4S, v21.4S, v10.4S // .............................................................................e.......................................................................................................................................................................................................... 
+ sub v21.4S, v21.4S, v10.4S // ............................................................................e........................................................................................................................................................................................................... + mls v14.4S, v8.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + add v8.4S, v9.4S, v13.4S // .................................................................................................e...................................................................................................................................................................................... + add v11.4S, v17.4S, v28.4S // ....................................................e................................................................................................................................................................................................................................... + mls v19.4S, v20.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + sub v10.4S, v17.4S, v28.4S // ...................................................e.................................................................................................................................................................................................................................... 
+ add v9.4S, v22.4S, v23.4S // ..............................................................e......................................................................................................................................................................................................................... + sqrdmulh v13.4S, v12.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + mul v12.4S, v12.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + add v28.4S, v19.4S, v14.4S // ........................................................................e............................................................................................................................................................................................................... + sqrdmulh v20.4S, v24.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + mul v17.4S, v24.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. 
+ sub v24.4S, v19.4S, v14.4S // .......................................................................e................................................................................................................................................................................................................ + sub v14.4S, v22.4S, v23.4S // .............................................................e.......................................................................................................................................................................................................................... + mls v12.4S, v13.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + sub v23.4S, v9.4S, v28.4S // .....................................................................................................e.................................................................................................................................................................................. + add v9.4S, v9.4S, v28.4S // ......................................................................................................e................................................................................................................................................................................. + mul v19.4S, v21.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... 
+ sqrdmulh v21.4S, v21.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + sqrdmulh v28.4S, v14.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mul v22.4S, v14.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v13.4S, v15.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mul v15.4S, v15.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + mls v19.4S, v21.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
+ mul v21.4S, v27.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v14.4S, v16.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + sqrdmulh v16.4S, v16.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mls v22.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... + ldr q28, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... 
+ mls v15.4S, v13.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + ldr q13, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + mls v17.4S, v20.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + mls v14.4S, v16.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + add v16.4S, v28.4S, v13.4S // ...............................................e........................................................................................................................................................................................................................................ + mls v21.4S, v27.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... 
+ sub v27.4S, v16.4S, v11.4S // ......................................................................................e................................................................................................................................................................................................. + add v16.4S, v16.4S, v11.4S // .......................................................................................e................................................................................................................................................................................................ + sqrdmulh v20.4S, v10.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + sqrdmulh v11.4S, v27.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + sub v28.4S, v28.4S, v13.4S // ..............................................e......................................................................................................................................................................................................................................... + mul v13.4S, v10.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. 
+ sqrdmulh v10.4S, v28.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mls v13.4S, v20.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + sub v20.4S, v15.4S, v12.4S // ..........................................................................................................e............................................................................................................................................................................. + add v12.4S, v15.4S, v12.4S // ...........................................................................................................e............................................................................................................................................................................ + mul v15.4S, v28.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + add v28.4S, v18.4S, v16.4S // .....................................................................................................................e.................................................................................................................................................................. 
+ sub v18.4S, v18.4S, v16.4S // ....................................................................................................................e................................................................................................................................................................... + mls v15.4S, v10.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sub v16.4S, v17.4S, v14.4S // .................................................................................e...................................................................................................................................................................................................... + add v14.4S, v17.4S, v14.4S // ..................................................................................e..................................................................................................................................................................................................... + sqrdmulh v10.4S, v18.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + sub v17.4S, v8.4S, v28.4S // ........................................................................................................................................e............................................................................................................................................... 
+ add v8.4S, v8.4S, v28.4S // .........................................................................................................................................e.............................................................................................................................................. + mul v28.4S, v27.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + add v27.4S, v15.4S, v13.4S // ............................................................................................e........................................................................................................................................................................................... + sub v15.4S, v15.4S, v13.4S // ...........................................................................................e............................................................................................................................................................................................ + mul v13.4S, v18.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + mls v13.4S, v10.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... 
+ add v10.4S, v14.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v14.4S, v14.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. + mls v28.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sub v18.4S, v9.4S, v10.4S // .............................................................................................................................................e.......................................................................................................................................... + add v9.4S, v9.4S, v10.4S // ..............................................................................................................................................e......................................................................................................................................... + mul v11.4S, v24.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. 
+ sqrdmulh v27.4S, v24.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + add v10.4S, v19.4S, v28.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v28.4S, v19.4S, v28.4S // ..............................................................................................................................e......................................................................................................................................................... + sqrdmulh v19.4S, v23.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mul v24.4S, v23.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + mls v24.4S, v19.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. 
+ mul v19.4S, v17.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. + sqrdmulh v23.4S, v17.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ + mls v11.4S, v27.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + mul v17.4S, v18.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v27.4S, v18.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v19.4S, v23.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... 
+ sub v23.4S, v22.4S, v11.4S // ...............................................................................................................e........................................................................................................................................................................ + add v11.4S, v22.4S, v11.4S // ................................................................................................................e....................................................................................................................................................................... + sqrdmulh v22.4S, v16.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... + mls v17.4S, v27.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + mul v27.4S, v16.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + mls v27.4S, v22.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. 
+ sqrdmulh v18.4S, v15.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + sqrdmulh v22.4S, v23.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + mul v16.4S, v23.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mul v23.4S, v15.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + mls v23.4S, v18.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sub v18.4S, v12.4S, v10.4S // ..................................................................................................................................................e..................................................................................................................................... 
+ add v10.4S, v12.4S, v10.4S // ...................................................................................................................................................e.................................................................................................................................... + add v12.4S, v21.4S, v13.4S // .............................................................................................................................................................e.......................................................................................................................... + sqrdmulh v15.4S, v14.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + sub v21.4S, v21.4S, v13.4S // ............................................................................................................................................................e........................................................................................................................... + mul v13.4S, v14.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + mls v16.4S, v22.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... 
+ mul v22.4S, v28.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + sub v14.4S, v27.4S, v23.4S // ...................................................................................................................................e.................................................................................................................................................... + + // original source code + // ldr q8, [x1, #0] // ....................................................................................e.......................................................................................................................................|...............................................................................................................................................e........................ + // ldr q9, [x1, #(1*(512/8))] // .................................................................................e..........................................................................................................................................|............................................................................................................................................e........................... + // ldr q10, [x1, #(2*(512/8))] // ........................................................................................e...................................................................................................................................|...................................................................................................................................................e.................... 
+ // ldr q11, [x1, #(3*(512/8))] // .......................................................................................e....................................................................................................................................|..................................................................................................................................................e..................... + // ldr q12, [x1, #(4*(512/8))] // ...............................e............................................................................................................................................................................................|..........................................................................................e............................................................................. + // ldr q13, [x1, #(5*(512/8))] // ......................................................e.....................................................................................................................................................................|.................................................................................................................e...................................................... + // ldr q14, [x1, #(6*(512/8))] // ..........................e.................................................................................................................................................................................................|.....................................................................................e.................................................................................. 
+ // ldr q15, [x1, #(7*(512/8))] // e...........................................................................................................................................................................................................................|...........................................................e............................................................................................................ + // ldr q16, [x1, #(8*(512/8))] // ..............................................................................................e.............................................................................................................................|.........................................................................................................................................................e.............. + // ldr q17, [x1, #(9*(512/8))] // ....................................................................e.......................................................................................................................................................|...............................................................................................................................e........................................ + // ldr q18, [x1, #(10*(512/8))] // .........................................................................................................e..................................................................................................................|....................................................................................................................................................................e... 
+ // ldr q19, [x1, #(11*(512/8))] // ...................................................................................e........................................................................................................................................|..............................................................................................................................................e......................... + // ldr q20, [x1, #(12*(512/8))] // .......................................................................................................................................................e....................................................................|........................................................................................................................................................................ + // ldr q21, [x1, #(13*(512/8))] // .........................................................................................................................................................e..................................................................|........................................................................................................................................................................ + // ldr q22, [x1, #(14*(512/8))] // ................................................................................................................e...........................................................................................................|........................................................................................................................................................................ 
+ // ldr q23, [x1, #(15*(512/8))] // .................................................................................................................e..........................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v8.4s, v9.4s // ....................................................................................................e.......................................................................................................................|...............................................................................................................................................................e........ + // add v8.4s, v8.4s, v9.4s // ...............................................................................................e............................................................................................................................|..........................................................................................................................................................e............. + // mul v9.4s, v24.4s, v3.s[2] // .............................................................................................................e..............................................................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..........................................................................................................e.................................................................................................................|.....................................................................................................................................................................e.. + // mls v9.4s, v24.4s, v29.4s // ..................................................................................................................e.........................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v10.4s, v11.4s // ..................................................................................................e.........................................................................................................................|.............................................................................................................................................................e.......... + // add v10.4s, v10.4s, v11.4s // ................................................................................................e...........................................................................................................................|...........................................................................................................................................................e............ 
+ // mul v11.4s, v24.4s, v4.s[0] // ...........................................................................................................e................................................................................................................|......................................................................................................................................................................e. + // sqrdmulh v24.4s, v24.4s, v4.s[1] // .......................................................................................................e....................................................................................................................|..................................................................................................................................................................e..... + // mls v11.4s, v24.4s, v29.4s // ........................................................................................................................e...................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v12.4s, v13.4s // ................................................................................e...........................................................................................................................................|...........................................................................................................................................e............................ 
+ // add v12.4s, v12.4s, v13.4s // ......................................................................e.....................................................................................................................................................|.................................................................................................................................e...................................... + // mul v13.4s, v24.4s, v4.s[2] // .............................................................................................e..............................................................................................................................|........................................................................................................................................................e............... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .................................................................................................e..........................................................................................................................|............................................................................................................................................................e........... + // mls v13.4s, v24.4s, v29.4s // ..............................................................................................................................e.............................................................................................|........................................................................................................................................................................ 
+ // sub v24.4s, v14.4s, v15.4s // .................................e..........................................................................................................................................................................................|............................................................................................e........................................................................... + // add v14.4s, v14.4s, v15.4s // ..................................................e.........................................................................................................................................................................|.............................................................................................................e.......................................................... + // mul v15.4s, v24.4s, v5.s[0] // ......................................................................................................................e.....................................................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ..................................................................................e.........................................................................................................................................|.............................................................................................................................................e.......................... 
+ // mls v15.4s, v24.4s, v29.4s // ...........................................................................................................................e................................................................................................|........................................................................................................................................................................ + // sub v24.4s, v16.4s, v17.4s // ...............................................................................................................e............................................................................................................|........................................................................................................................................................................ + // add v16.4s, v16.4s, v17.4s // ..............................................................................................................e.............................................................................................................|........................................................................................................................................................................ + // mul v17.4s, v24.4s, v5.s[2] // .....................................................................................................................................e......................................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v5.s[3] // ....................................................................................................................................e.......................................................................................|........................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // ..........................................................................................................................................................e.................................................................|........................................................................................................................................................................ + // sub v24.4s, v18.4s, v19.4s // .....................................................................................................................e......................................................................................................|........................................................................................................................................................................ + // add v18.4s, v18.4s, v19.4s // ....................................................................................................................e.......................................................................................................|........................................................................................................................................................................ 
+ // mul v19.4s, v24.4s, v6.s[0] // ....................................................................................................................................................e.......................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[1] // .....................................................................................................................................................e......................................................................|........................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // ...........................................................................................................................................................e................................................................|........................................................................................................................................................................ + // sub v24.4s, v20.4s, v21.4s // ..................................................................................................................................................................e.........................................................|........................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v21.4s // ............................................................................................................................................................e...............................................................|........................................................................................................................................................................ + // mul v21.4s, v24.4s, v6.s[2] // ........................................................................................................................................................................e...................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ....................................................................................................................................................................e.......................................................|........................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ...........................................................................................................................................................................e................................................|........................................................................................................................................................................ 
+ // sub v24.4s, v22.4s, v23.4s // ...............................................................................................................................e............................................................................................|........................................................................................................................................................................ + // add v22.4s, v22.4s, v23.4s // .............................................................................................................................e..............................................................................................|........................................................................................................................................................................ + // mul v23.4s, v24.4s, v7.s[0] // ...................................................................................................................................................................e........................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ................................................................................................................................................................e...........................................................|........................................................................................................................................................................ 
+ // mls v23.4s, v24.4s, v29.4s // .....................................................................................................................................................................e......................................................|........................................................................................................................................................................ + // sub v24.4s, v8.4s, v10.4s // ...................................................................................................................e........................................................................................................|........................................................................................................................................................................ + // add v8.4s, v8.4s, v10.4s // ......................................................................................................e.....................................................................................................................|.................................................................................................................................................................e...... + // mul v10.4s, v24.4s, v1.s[2] // ................................................................................................................................................e...........................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...............................................................................................................................................e............................................................................|........................................................................................................................................................................ + // mls v10.4s, v24.4s, v29.4s // ........................................................................................................................................................e...................................................................|........................................................................................................................................................................ + // sub v24.4s, v9.4s, v11.4s // .......................................................................................................................................e....................................................................................|........................................................................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................................................................................................................................e...........................................................................................|........................................................................................................................................................................ 
+ // mul v11.4s, v24.4s, v1.s[2] // ..............................................................................................................................................e.............................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................................................................e..............................................................................|........................................................................................................................................................................ + // mls v11.4s, v24.4s, v29.4s // ......................................................................................................................................................e.....................................................................|........................................................................................................................................................................ + // sub v24.4s, v12.4s, v14.4s // ............................................................................................e...............................................................................................................................|.......................................................................................................................................................e................ 
+ // add v12.4s, v12.4s, v14.4s // ...........................................................................................e................................................................................................................................|......................................................................................................................................................e................. + // mul v14.4s, v24.4s, v2.s[0] // ..................................................................................................................................e.........................................................................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................................................................................................................e..........................................................................................|........................................................................................................................................................................ + // mls v14.4s, v24.4s, v29.4s // ........................................................................................................................................e...................................................................................|........................................................................................................................................................................ 
+ // sub v24.4s, v13.4s, v15.4s // ......................................................................................................................................e.....................................................................................|........................................................................................................................................................................ + // add v13.4s, v13.4s, v15.4s // ...................................................................................................................................e........................................................................................|........................................................................................................................................................................ + // mul v15.4s, v24.4s, v2.s[0] // ...........................................................................................................................................................................................e................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................................................................................................e...............................|........................................................................................................................................................................ 
+ // mls v15.4s, v24.4s, v29.4s // ....................................................................................................................................................................................................e.......................|........................................................................................................................................................................ + // sub v24.4s, v16.4s, v18.4s // ..........................................................................................................................e.................................................................................................|........................................................................................................................................................................ + // add v16.4s, v16.4s, v18.4s // .........................................................................................................................e..................................................................................................|........................................................................................................................................................................ + // mul v18.4s, v24.4s, v2.s[2] // ...........................................................................................................................................e................................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................................................................................................................e...............................................................................|........................................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................e..........................................................................|........................................................................................................................................................................ + // sub v24.4s, v17.4s, v19.4s // ............................................................................................................................................................................e...............................................|........................................................................................................................................................................ + // add v17.4s, v17.4s, v19.4s // .............................................................................................................................................................................e..............................................|........................................................................................................................................................................ 
+ // mul v19.4s, v24.4s, v2.s[2] // ............................................................................................................................................................................................................e...............|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................................................................................................................................................e.................|........................................................................................................................................................................ + // mls v19.4s, v24.4s, v29.4s // .............................................................................................................................................................................................................e..............|........................................................................................................................................................................ + // sub v24.4s, v20.4s, v22.4s // ..............................................................................................................................................................e.............................................................|........................................................................................................................................................................ 
+ // add v20.4s, v20.4s, v22.4s // ...............................................................................................................................................................e............................................................|........................................................................................................................................................................ + // mul v22.4s, v24.4s, v3.s[0] // .................................................................................................................................................................................e..........................................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................................................................................e..........................................................|........................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ........................................................................................................................................................................................e...................................|........................................................................................................................................................................ 
+ // sub v24.4s, v21.4s, v23.4s // ...................................................................................................................................................................................e........................................|........................................................................................................................................................................ + // add v21.4s, v21.4s, v23.4s // ..................................................................................................................................................................................e.........................................|........................................................................................................................................................................ + // mul v23.4s, v24.4s, v3.s[0] // .................................................................................................................................................................................................................e..........|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................................................................................................................................e.............|........................................................................................................................................................................ 
+ // mls v23.4s, v24.4s, v29.4s // ..................................................................................................................................................................................................................e.........|........................................................................................................................................................................ + // sub v24.4s, v8.4s, v12.4s // .......................................................................................................................e....................................................................................................|........................................................................................................................................................................ + // add v8.4s, v8.4s, v12.4s // ............................................................................................................................e...............................................................................................|........................................................................................................................................................................ + // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................................................e.........................................................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................................................e........................................................................|........................................................................................................................................................................ + // mls v12.4s, v24.4s, v29.4s // .............................................................................................................................................................e..............................................................|........................................................................................................................................................................ + // sub v24.4s, v9.4s, v13.4s // .........................................................................................................................................e..................................................................................|........................................................................................................................................................................ + // add v9.4s, v9.4s, v13.4s // ..........................................................................................................................................e.................................................................................|........................................................................................................................................................................ 
+ // mul v13.4s, v24.4s, v0.s[2] // ................................................................................................................................................................................................e...........................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................................e............................|........................................................................................................................................................................ + // mls v13.4s, v24.4s, v29.4s // .................................................................................................................................................................................................e..........................|........................................................................................................................................................................ + // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................................................e.....................................................|........................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v14.4s // .......................................................................................................................................................................e....................................................|........................................................................................................................................................................ + // mul v14.4s, v24.4s, v0.s[2] // ............................................................................................................................................................................................................................|........*............................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............................................................................................................................................................................................................................|.....*.................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...........*............................................................................................................................................................ 
+ // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................................................................................e...................|........................................................................................................................................................................ + // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................................................................................e..................|........................................................................................................................................................................ + // mul v15.4s, v24.4s, v0.s[2] // ................................................................................................................................................................................................................e...........|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................................................................................e............|........................................................................................................................................................................ 
+ // mls v15.4s, v24.4s, v29.4s // .........................................................................................................................................................................................................................e..|........................................................................................................................................................................ + // sub v24.4s, v16.4s, v20.4s // ..........................................................................................................................................................................e.................................................|........................................................................................................................................................................ + // add v16.4s, v16.4s, v20.4s // .........................................................................................................................................................................e..................................................|........................................................................................................................................................................ + // mul v20.4s, v24.4s, v1.s[0] // ....................................................................................................................................................................................e.......................................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................................e.............................................|........................................................................................................................................................................ + // mls v20.4s, v24.4s, v29.4s // .....................................................................................................................................................................................e......................................|........................................................................................................................................................................ + // sub v24.4s, v17.4s, v21.4s // .......................................................................................................................................................................................e....................................|........................................................................................................................................................................ + // add v17.4s, v17.4s, v21.4s // ......................................................................................................................................................................................e.....................................|........................................................................................................................................................................ 
+ // mul v21.4s, v24.4s, v1.s[0] // ........................................................................................................................................................................................................................e...|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................................................................................................................e.....|........................................................................................................................................................................ + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|..*..................................................................................................................................................................... + // sub v24.4s, v18.4s, v22.4s // ..............................................................................................................................................................................................e.............................|........................................................................................................................................................................ 
+ // add v18.4s, v18.4s, v22.4s // .............................................................................................................................................................................................e..............................|........................................................................................................................................................................ + // mul v22.4s, v24.4s, v1.s[0] // ..........................................................................................................................................................................................................................e.|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................................................*........................................................................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|......*................................................................................................................................................................. 
+ // sub v24.4s, v19.4s, v23.4s // ...........................................................................................................................................................................................................................e|........................................................................................................................................................................ + // add v19.4s, v19.4s, v23.4s // ............................................................................................................................................................................................................................|*....................................................................................................................................................................... + // mul v23.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................................................................|............*........................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................................................|..........*............................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.............*.......................................................................................................................................................... + // sub v24.4s, v8.4s, v16.4s // ...............................................................................................................................................................................e............................................|........................................................................................................................................................................ + // add v8.4s, v8.4s, v16.4s // ................................................................................................................................................................................e...........................................|........................................................................................................................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................................................e.........................|........................................................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................................................................e........................|........................................................................................................................................................................ + // mls v16.4s, v24.4s, v29.4s // .......................................................................................................................................................................................................e....................|........................................................................................................................................................................ + // sub v24.4s, v9.4s, v17.4s // .........................................................................................................................................................................................e..................................|........................................................................................................................................................................ + // add v9.4s, v9.4s, v17.4s // ..........................................................................................................................................................................................e.................................|........................................................................................................................................................................ 
+ // mul v17.4s, v24.4s, v0.s[0] // .....................................................................................................................................................................................................e......................|........................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................................................................................e.....................|........................................................................................................................................................................ + // mls v17.4s, v24.4s, v29.4s // ...........................................................................................................................................................................................................e................|........................................................................................................................................................................ + // sub v24.4s, v10.4s, v18.4s // ...................................................................................................................................................................................................................e........|........................................................................................................................................................................ 
+ // add v10.4s, v10.4s, v18.4s // ....................................................................................................................................................................................................................e.......|........................................................................................................................................................................ + // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|..........................*............................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.........................*.............................................................................................................................................. + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|....................................*................................................................................................................................... 
+ // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................................................................|...*.................................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................................................................|....*................................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|....................*................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................*...................................................................................................................................................... 
+ // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...........................*............................................................................................................................................ + // sub v24.4s, v12.4s, v20.4s // .......................................................................................................................................................................................................................e....|........................................................................................................................................................................ + // add v12.4s, v12.4s, v20.4s // .....................................................................................................................................................................................................................e......|........................................................................................................................................................................ + // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|..................................*..................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................................*...................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.........................................*.............................................................................................................................. + // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................................................................|...............*........................................................................................................................................................ + // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................................................................|................*....................................................................................................................................................... 
+ // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|........................................................*............................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|.................................................*...................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // .*..........................................................................................................................................................................................................................|............................................................*........................................................................................................... + // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................................................................|..................*..................................................................................................................................................... 
+ // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................................................................|...................*.................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|............................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|...........................................*............................................................................................................................ + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|...............................................*........................................................................................................................ 
+ // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................................................................|.......................*................................................................................................................................................ + // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................................................................|.....................*.................................................................................................................................................. + // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................................................|.............................*.......................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................................................|...............................*........................................................................................................................................ 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................................................|.............................................*.......................................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................................................................|.*...................................................................................................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................................................................|.......*................................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|.........*.............................................................................................................................................................. 
+ // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|..............*......................................................................................................................................................... + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................................................|........................*............................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................................................|............................*........................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..............................*......................................................................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|................................*....................................................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................................................................|..............................................*......................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................................................................|................................................*....................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..................................................*..................................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|....................................................*................................................................................................................... + // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................................................................|........................................*............................................................................................................................... + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................................................................|.....................................*.................................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................................................|..........................................*............................................................................................................................. 
+ // mls v19.4s, v28.4s, v29.4s // ............................................................................................................................................................................................................................|.....................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v20.4s // ..............*.............................................................................................................................................................................................................|.........................................................................*.............................................................................................. + // cmge v28.4s, v20.4s, v30.4s // ........*...................................................................................................................................................................................................................|...................................................................*.................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................*.........................................................................................................................................................................................................|.............................................................................*.......................................................................................... 
+ // mls v20.4s, v28.4s, v29.4s // ........................*...................................................................................................................................................................................................|...................................................................................*.................................................................................... + // cmge v27.4s, v31.4s, v21.4s // ............*...............................................................................................................................................................................................................|.......................................................................*................................................................................................ + // cmge v28.4s, v21.4s, v30.4s // ...........*................................................................................................................................................................................................................|......................................................................*................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ................*...........................................................................................................................................................................................................|...........................................................................*............................................................................................ 
+ // mls v21.4s, v28.4s, v29.4s // ............................*...............................................................................................................................................................................................|.......................................................................................*................................................................................ + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................................................................|..........................................................*............................................................................................................. + // cmge v28.4s, v22.4s, v30.4s // ..*.........................................................................................................................................................................................................................|.............................................................*.......................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ......*.....................................................................................................................................................................................................................|.................................................................*...................................................................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ..........*.................................................................................................................................................................................................................|.....................................................................*.................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // ...*........................................................................................................................................................................................................................|..............................................................*......................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // .....*......................................................................................................................................................................................................................|................................................................*....................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .........*..................................................................................................................................................................................................................|....................................................................*................................................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // .............*..............................................................................................................................................................................................................|........................................................................*............................................................................................... + // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................................................................|......................*................................................................................................................................................. + // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................................................................|...................................*.................................................................................................................................... + // str q18, [x1, #(10*(512/8))] // ............................................................................................................................................................................................................................|.......................................................*................................................................................................................ 
+ // str q19, [x1, #(11*(512/8))] // ............................................................................................................................................................................................................................|.........................................................*.............................................................................................................. + // str q20, [x1, #(12*(512/8))] // ................................*...........................................................................................................................................................................................|...........................................................................................*............................................................................ + // str q21, [x1, #(13*(512/8))] // ...................................*........................................................................................................................................................................................|..............................................................................................*......................................................................... + // str q22, [x1, #(14*(512/8))] // ...................*........................................................................................................................................................................................................|..............................................................................*......................................................................................... 
+ // str q23, [x1, #(15*(512/8))] // ......................*.....................................................................................................................................................................................................|.................................................................................*...................................................................................... + // mul v16.4s, v8.4s, v25.4s // ....................*.......................................................................................................................................................................................................|...............................................................................*........................................................................................ + // sqrdmulh v8.4s, v8.4s, v26.4s // .................*..........................................................................................................................................................................................................|............................................................................*........................................................................................... + // mls v16.4s, v8.4s, v29.4s // ..............................*.............................................................................................................................................................................................|.........................................................................................*.............................................................................. 
+ // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................................................................|.......................................*................................................................................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................................................................|......................................*................................................................................................................................. + // mls v17.4s, v9.4s, v29.4s // .......*....................................................................................................................................................................................................................|..................................................................*..................................................................................................... + // mul v18.4s, v10.4s, v25.4s // ................................................*...........................................................................................................................................................................|...........................................................................................................*............................................................ 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................*..........................................................................................................................................................................|............................................................................................................*........................................................... + // mls v18.4s, v10.4s, v29.4s // ........................................................*...................................................................................................................................................................|...................................................................................................................*.................................................... + // mul v19.4s, v11.4s, v25.4s // ............................................................................................................................................................................................................................|......................................................*................................................................................................................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................................................................................................................................................................................................|...................................................*.................................................................................................................... 
+ // mls v19.4s, v11.4s, v29.4s // ....*.......................................................................................................................................................................................................................|...............................................................*........................................................................................................ + // mul v20.4s, v12.4s, v25.4s // ...........................................*................................................................................................................................................................................|......................................................................................................*................................................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // .........................................*..................................................................................................................................................................................|....................................................................................................*................................................................... + // mls v20.4s, v12.4s, v29.4s // ....................................................*.......................................................................................................................................................................|...............................................................................................................*........................................................ 
+ // mul v21.4s, v13.4s, v25.4s // ....................................*.......................................................................................................................................................................................|...............................................................................................*........................................................................ + // sqrdmulh v13.4s, v13.4s, v26.4s // ..................................*.........................................................................................................................................................................................|.............................................................................................*.......................................................................... + // mls v21.4s, v13.4s, v29.4s // ...................................................*........................................................................................................................................................................|..............................................................................................................*......................................................... + // mul v22.4s, v14.4s, v25.4s // ............................................................*...............................................................................................................................................................|.......................................................................................................................*................................................ 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // .........................................................*..................................................................................................................................................................|....................................................................................................................*................................................... + // mls v22.4s, v14.4s, v29.4s // ..................................................................*.........................................................................................................................................................|.............................................................................................................................*.......................................... + // mul v23.4s, v15.4s, v25.4s // .............................................*..............................................................................................................................................................................|........................................................................................................*............................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ..............................................*.............................................................................................................................................................................|.........................................................................................................*.............................................................. 
+ // mls v23.4s, v15.4s, v29.4s // ...............................................................*............................................................................................................................................................|..........................................................................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // .....................................*......................................................................................................................................................................................|................................................................................................*....................................................................... + // cmge v28.4s, v16.4s, v30.4s // .......................................*....................................................................................................................................................................................|..................................................................................................*..................................................................... + // sub v28.4s, v27.4s, v28.4s // ...............................................*............................................................................................................................................................................|..........................................................................................................*............................................................. 
+ // mls v16.4s, v28.4s, v29.4s // ...............................................................................*............................................................................................................................................|..........................................................................................................................................*............................. + // cmge v27.4s, v31.4s, v17.4s // .......................*....................................................................................................................................................................................................|..................................................................................*..................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ...............*............................................................................................................................................................................................................|..........................................................................*............................................................................................. + // sub v28.4s, v27.4s, v28.4s // .........................*..................................................................................................................................................................................................|....................................................................................*................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ........................................*...................................................................................................................................................................................|...................................................................................................*.................................................................... + // cmge v27.4s, v31.4s, v18.4s // ..............................................................*.............................................................................................................................................................|.........................................................................................................................*.............................................. + // cmge v28.4s, v18.4s, v30.4s // ................................................................*...........................................................................................................................................................|...........................................................................................................................*............................................ + // sub v28.4s, v27.4s, v28.4s // ...................................................................*........................................................................................................................................................|..............................................................................................................................*......................................... 
+ // mls v18.4s, v28.4s, v29.4s // ........................................................................*...................................................................................................................................................|...................................................................................................................................*.................................... + // cmge v27.4s, v31.4s, v19.4s // .....................*......................................................................................................................................................................................................|................................................................................*....................................................................................... + // cmge v28.4s, v19.4s, v30.4s // ...........................*................................................................................................................................................................................................|......................................................................................*................................................................................. + // sub v28.4s, v27.4s, v28.4s // .............................*..............................................................................................................................................................................................|........................................................................................*............................................................................... 
+ // mls v19.4s, v28.4s, v29.4s // ......................................*.....................................................................................................................................................................................|.................................................................................................*...................................................................... + // cmge v27.4s, v31.4s, v20.4s // .............................................................*..............................................................................................................................................................|........................................................................................................................*............................................... + // cmge v28.4s, v20.4s, v30.4s // ...........................................................*................................................................................................................................................................|......................................................................................................................*................................................. + // sub v28.4s, v27.4s, v28.4s // .................................................................*..........................................................................................................................................................|............................................................................................................................*........................................... 
+ // mls v20.4s, v28.4s, v29.4s // .....................................................................*......................................................................................................................................................|................................................................................................................................*....................................... + // cmge v27.4s, v31.4s, v21.4s // .....................................................*......................................................................................................................................................................|................................................................................................................*....................................................... + // cmge v28.4s, v21.4s, v30.4s // .......................................................*....................................................................................................................................................................|..................................................................................................................*..................................................... + // sub v28.4s, v27.4s, v28.4s // ..........................................................*.................................................................................................................................................................|.....................................................................................................................*.................................................. 
+ // mls v21.4s, v28.4s, v29.4s // ...........................................................................*................................................................................................................................................|......................................................................................................................................*................................. + // cmge v27.4s, v31.4s, v22.4s // ..........................................................................*.................................................................................................................................................|.....................................................................................................................................*.................................. + // cmge v28.4s, v22.4s, v30.4s // ............................................................................*...............................................................................................................................................|.......................................................................................................................................*................................ + // sub v28.4s, v27.4s, v28.4s // .....................................................................................*......................................................................................................................................|................................................................................................................................................*....................... 
+ // mls v22.4s, v28.4s, v29.4s // .....................................................................................................*......................................................................................................................|................................................................................................................................................................*....... + // cmge v27.4s, v31.4s, v23.4s // .........................................................................*..................................................................................................................................................|....................................................................................................................................*................................... + // cmge v28.4s, v23.4s, v30.4s // .......................................................................*....................................................................................................................................................|..................................................................................................................................*..................................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................*.............................................................................................................................................|.........................................................................................................................................*.............................. 
+ // mls v23.4s, v28.4s, v29.4s // ......................................................................................*.....................................................................................................................................|.................................................................................................................................................*...................... + // str q16, [x1], #(16) // ........................................................................................................*...................................................................................................................|...................................................................................................................................................................*.... + // str q17, [x1, #(-16 + 1*(512/8))] // ............................................*...............................................................................................................................................................................|.......................................................................................................*................................................................ + // str q18, [x1, #(-16 + 2*(512/8))] // .........................................................................................*..................................................................................................................................|....................................................................................................................................................*................... 
+ // str q19, [x1, #(-16 + 3*(512/8))] // ..........................................*.................................................................................................................................................................................|.....................................................................................................*.................................................................. + // str q20, [x1, #(-16 + 4*(512/8))] // .............................................................................*..............................................................................................................................................|........................................................................................................................................*............................... + // str q21, [x1, #(-16 + 5*(512/8))] // ..........................................................................................*.................................................................................................................................|.....................................................................................................................................................*.................. 
+ // str q22, [x1, #(-16 + 6*(512/8))] // ............................................................................................................*...............................................................................................................|.......................................................................................................................................................................* + // str q23, [x1, #(-16 + 7*(512/8))] // ...................................................................................................*........................................................................................................................|..............................................................................................................................................................*......... + + sub count, count, #1 + cbnz count, layer1234_start + sqrdmulh v28.4S, v28.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + add v23.4S, v27.4S, v23.4S // ....................................................................................................................................*................................................................................................................................................... + sqrdmulh v27.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mls v13.4S, v15.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + sub v15.4S, v11.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v11.4S, v11.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + mul v23.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + cmge v20.4S, v19.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v22.4S, v28.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... 
+ cmge v28.4S, v31.4S, v19.4S // ................................................................................................................................................................................*....................................................................................................... + mls v23.4S, v27.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sub v28.4S, v28.4S, v20.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v20.4S, v15.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sqrdmulh v27.4S, v14.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ mul v28.4S, v15.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + str q19, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + mul v19.4S, v14.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + cmge v14.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + sqrdmulh v15.4S, v18.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mls v19.4S, v27.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ 
+ cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mul v18.4S, v18.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sub v14.4S, v27.4S, v14.4S // ......................................................................................................................................................................................*................................................................................................. + mls v18.4S, v15.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + add v15.4S, v16.4S, v19.4S // ............................................................................................................................................................................*........................................................................................................... + mls v17.4S, v14.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
+ sub v16.4S, v16.4S, v19.4S // ...........................................................................................................................................................................*............................................................................................................ + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mls v28.4S, v20.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sub v27.4S, v23.4S, v22.4S // ......................................................................................................................................................................*................................................................................................................. + add v14.4S, v23.4S, v22.4S // .......................................................................................................................................................................*................................................................................................................ + cmge v22.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. 
+ sqrdmulh v20.4S, v21.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + cmge v23.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... + mul v19.4S, v21.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sub v17.4S, v24.4S, v13.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v24.4S, v13.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v22.4S, v23.4S, v22.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ sqrdmulh v24.4S, v16.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + cmge v21.4S, v28.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + mul v23.4S, v16.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + cmge v16.4S, v31.4S, v28.4S // ............................................................................................................................................................................................*........................................................................................... + mls v18.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + mul v22.4S, v27.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... 
+ sqrdmulh v27.4S, v27.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v18.4S, v17.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mls v19.4S, v20.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + mul v20.4S, v17.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mls v20.4S, v18.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ cmge v18.4S, v31.4S, v19.4S // ................................................................................................................................................................................................*....................................................................................... + mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + cmge v24.4S, v19.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v22.4S, v27.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + sub v27.4S, v16.4S, v21.4S // ..............................................................................................................................................................................................*......................................................................................... + sub v18.4S, v18.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... 
+ mls v28.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + cmge v27.4S, v31.4S, v20.4S // ....................................................................................................................................................................................................*................................................................................... + cmge v24.4S, v20.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + cmge v16.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... 
+ cmge v21.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + str q28, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v27.4S, v27.4S, v24.4S // ......................................................................................................................................................................................................*................................................................................. + mls v19.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v18.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + sub v24.4S, v21.4S, v16.4S // ..............................................................................................................................................................................................................*......................................................................... 
+ mls v20.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v27.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + mls v23.4S, v24.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + str q19, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v18.4S, v18.4S, v27.4S // ..........................................................................................................................................................................................................*............................................................................. + sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ str q20, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + mls v22.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + sqrdmulh v10.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ 
+ str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + sqrdmulh v22.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v19.4S, v22.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v24.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ 
+ cmge v8.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + cmge v11.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sub v27.4S, v8.4S, v11.4S // ..............................................................................................................................................................................................................................................................*......................... + mls v18.4S, v10.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ cmge v10.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + cmge v8.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sqrdmulh v12.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + cmge v23.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... 
+ sub v10.4S, v10.4S, v8.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v8.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sub v11.4S, v11.4S, v23.4S // ..................................................................................................................................................................................................................................................*..................................... + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + cmge v23.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ sqrdmulh v9.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + sub v8.4S, v8.4S, v23.4S // ..........................................................................................................................................................................................................................................................*............................. + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + mls v22.4S, v24.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mls v20.4S, v12.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v23.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v17.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + cmge v10.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + cmge v15.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v13.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ sub v10.4S, v24.4S, v10.4S // ..........................................................................................................................................................................................................................................................................*............. + mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v24.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + sub v13.4S, v15.4S, v13.4S // ..................................................................................................................................................................................................................................................................*..................... + mls v18.4S, v8.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ 
+ cmge v15.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v8.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v20.4S, v13.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v13.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v15.4S, v24.4S, v15.4S // ..............................................................................................................................................................................................................................................................................*......... 
+ mls v19.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v18.4S, v8.4S, v13.4S // ......................................................................................................................................................................................................................................................................*................. + mls v23.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + mls v22.4S, v10.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ 
+ str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + mls v21.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s new file mode 100644 index 00000000..ff52d0f4 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_firestorm.s @@ -0,0 +1,2096 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset] // load the q-register alias of vec from base + offset; base is left unchanged
+.endm
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc // load the q-register alias of vec from [base], then post-increment base by inc bytes
+.endm
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset] // store the q-register alias of vec to base + offset; base is left unchanged
+.endm
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc // store the q-register alias of vec to [base], then post-increment base by inc bytes
+.endm
+.macro vsub d,a,b
+        sub \d\().4s, \a\().4s, \b\().4s // d = a - b on four 32-bit lanes
+.endm
+.macro vadd d,a,b
+        add \d\().4s, \a\().4s, \b\().4s // d = a + b on four 32-bit lanes
+.endm
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s // d = signed saturating rounding doubling multiply of a and b, high half, per 32-bit lane
+.endm
+.macro vmul d,a,b
+        mul \d\().4s, \a\().4s, \b\().4s // d = low 32 bits of a * b per lane
+.endm
+.macro vmls d,a,b
+        mls \d\().4s, \a\().4s, \b\().4s // d = d - a * b per 32-bit lane
+.endm
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] // as vqrdmulh, but every lane of a is multiplied by the single lane i of b
+.endm
+.macro vmulq d,a,b,i
+        mul \d\().4s, \a\().4s, \b\().s[\i] // as vmul, but every lane of a is multiplied by the single lane i of b
+.endm
+.macro vmlsq d,a,b,i
+        mls \d\().4s, \a\().4s, \b\().s[\i] // as vmls, but every lane of a is multiplied by the single lane i of b
+.endm
+.macro trn1_d d,a,b
+        trn1 \d\().2d, \a\().2d, \b\().2d // d = { a[0], b[0] } viewed as 64-bit lanes
+.endm
+.macro trn2_d d,a,b
+        trn2 \d\().2d, \a\().2d, \b\().2d // d = { a[1], b[1] } viewed as 64-bit lanes
+.endm
+.macro trn1_s d,a,b
+        trn1 \d\().4s, \a\().4s, \b\().4s // d = { a[0], b[0], a[2], b[2] } viewed as 32-bit lanes
+.endm
+.macro trn2_s d,a,b
+        trn2 \d\().4s, \a\().4s, \b\().4s // d = { a[1], b[1], a[3], b[3] } viewed as 32-bit lanes
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0 // dst = low half of src * lane idx0 of const
+        vqrdmulhq \src, \src, \const, \idx1 // src is overwritten with the rounded doubling high half of src * lane idx1 of const
+        vmls \dst, \src, modulus // dst -= src * modulus; presumably yields src * const reduced mod the modulus when lane idx1 is the matching precomputed twist of lane idx0 - TODO confirm
+.endm
+
+.macro mulmod dst, src, const, const_twisted
+        vmul \dst, \src, \const // dst = low half of src * const, lane by lane
+        vqrdmulh \src, \src, \const_twisted // src is overwritten with the rounded doubling high half of src * const_twisted
+        vmls \dst, \src, modulus // dst -= src * modulus; same three-step pattern as mulmodq but with full vector operands
+.endm
+
+.macro barrett_reduce_single a
+        srshr tmp.4S, \a\().4S, #23 // tmp = a shifted right by 23 bits with rounding (signed); uses the file-level tmp alias as scratch
+        vmls \a, tmp, modulus // a -= tmp * modulus
+.endm
+
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 lane = all-ones (-1) where a <= neg_modulus_half, else 0
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 lane = all-ones (-1) where a >= modulus_half, else 0
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 = tmp1 - tmp2: -1 where a is below the range, +1 where above, 0 otherwise
+        vmls \a, \tmp2, modulus // a -= tmp2 * modulus, i.e. add or subtract one modulus where flagged
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1
+        vsub tmp, \a, \b // tmp = a - b, kept in the file-level tmp scratch alias
+        vadd \a, \a, \b // a = a + b
+        mulmodq \b, tmp, \root, \idx0, \idx1 // b = mulmodq of (a - b) with lanes idx0/idx1 of root; presumably a Gentleman-Sande butterfly - TODO confirm
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted
+        vmul \dst, \src, \const // body is identical to mulmod above
+        vqrdmulh \src, \src, \const_twisted
+        vmls \dst, \src, modulus
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted
+        vsub tmp, \a, \b // tmp = a - b, kept in the file-level tmp scratch alias
+        vadd \a, \a, \b // a = a + b
+        mulmod \b, tmp, \root, \root_twisted // b = mulmod of (a - b) with full-vector root and root_twisted; note this calls mulmod, not mulmod_v
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7
+        mulmod \dst0, \src0, ninv, ninv_tw // dstK = mulmod(srcK, ninv, ninv_tw) for K = 0..7; each srcK is clobbered by mulmod
+        mulmod \dst1, \src1, ninv, ninv_tw // NOTE(review): ninv and ninv_tw are not among the .req aliases visible at the function entry - confirm they are defined before this macro is expanded
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+        mulmod \dst4, \src4, ninv, ninv_tw
+        mulmod \dst5, \src5, ninv, ninv_tw
+        mulmod \dst6, \src6, ninv, ninv_tw
+        mulmod \dst7, \src7, ninv, ninv_tw
+.endm
+
+.macro load_roots_1234 r_ptr
+        ldr_vi root0, \r_ptr, (8*16) // load root0 and advance r_ptr past eight 16-byte vectors
+        ldr_vo root1, \r_ptr, (-8*16 + 1*16) // remaining loads use negative offsets from the already advanced pointer
+        ldr_vo root2, \r_ptr, (-8*16 + 2*16)
+        ldr_vo root3, \r_ptr, (-8*16 + 3*16)
+        ldr_vo root4, \r_ptr, (-8*16 + 4*16) // NOTE(review): root4..root7 are not among the .req aliases visible at the function entry - confirm before expanding this macro
+        ldr_vo root5, \r_ptr, (-8*16 + 5*16)
+        ldr_vo root6, \r_ptr, (-8*16 + 6*16)
+        ldr_vo root7, \r_ptr, (-8*16 + 7*16)
+.endm
+
+.macro load_next_roots_56 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 16 // load one 16-byte vector and advance the pointer by 16 bytes
+.endm
+
+.macro load_next_roots_6 root0, r_ptr0
+        ldr_vi \root0, \r_ptr0, 8 // loads a full 16-byte q register but advances the pointer by only 8 bytes
+.endm
+
+.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1
+        ldr_vi \root0, \r_ptr1, (6*16) // load root0 and advance r_ptr1 past six 16-byte vectors
+        ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) // remaining loads use negative offsets from the already advanced pointer
+        ldr_vo \root1, \r_ptr1, (-6*16 + 2*16)
+        ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16)
+        ldr_vo \root2, \r_ptr1, (-6*16 + 4*16)
+        ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16)
+.endm
+
+.macro transpose4 data
+        trn1_s t0, \data\()0, \data\()1 // first stage: interleave 32-bit lanes of registers 0/1 and 2/3 into scratch t0..t3
+        trn2_s t1, \data\()0, \data\()1
+        trn1_s t2, \data\()2, \data\()3
+        trn2_s t3, \data\()2, \data\()3
+
+        trn2_d \data\()2, t0, t2 // second stage: recombine 64-bit halves; the eight trn ops together transpose the 4x4 matrix of 32-bit lanes
+        trn2_d \data\()3, t1, t3
+        trn1_d \data\()0, t0, t2
+        trn1_d \data\()1, t1, t3
+.endm
+
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6) // reserve 96 bytes of stack for x19..x29
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0] // NOTE(review): exact repeat of the previous store to the same slot; redundant but harmless
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5] // x29 is stored alone; the upper 8 bytes of this 16-byte slot stay unused
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0] // reload x19..x29 from the slots written by save_gprs
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6) // release the 96 bytes reserved by save_gprs
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4) // reserve 64 bytes of stack for d8..d15
+        stp d8, d9, [sp, #16*0] // only the d-forms (low 64 bits of v8..v15) are saved
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0] // reload d8..d15 from the slots written by save_vregs
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()] // reload register a from stack offset loc
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()] // spill register a to stack offset loc; note the argument order is (loc, a), the reverse of restore
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE // extra STACK_SIZE bytes of local spill space addressed through save/restore
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE // undo push_stack in reverse order: locals, then vector registers, then general-purpose registers
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_1234_5678_twiddles.s"
+.text
+
+        .global intt_dilithium_1234_5678_opt_m1_firestorm
+        .global _intt_dilithium_1234_5678_opt_m1_firestorm
+
+.p2align 4
+modulus_addr: .quad 8380417 // 8380417 = 2^23 - 2^13 + 1; broadcast into the modulus register by the ld1r in the function preamble
+ninv_addr: .quad 16382 // presumably the scaling constant consumed by mul_ninv as ninv - TODO confirm
+ninv_tw_addr: .quad 4197891 // presumably the twisted companion of ninv_addr (ninv_tw) - TODO confirm
+intt_dilithium_1234_5678_opt_m1_firestorm:
+_intt_dilithium_1234_5678_opt_m1_firestorm:
+        push_stack
+
+        inp .req x0
+        in .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        data0 .req v8
+        data1 .req v9
+        data2 .req v10
+        data3 .req v11
+        data4 .req v12
+        data5 .req v13
+        data6 .req v14
+        data7 .req v15
+        data8 .req v16
+        data9 .req v17
+        data10 .req v18
+        data11 .req v19
+        data12 .req v20
+        data13 .req v21
+        data14 .req v22
+        data15 .req v23
+
+        qform_data0 .req q8
+        qform_data1 .req q9
+        qform_data2 .req q10
+        qform_data3 .req q11
+        qform_data4 .req q12
+        qform_data5 .req q13
+        qform_data6 .req q14
+        qform_data7 .req q15
+
qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v10.4S, v11.4S, v12.4S, v13.4S}, [x0] // *.......................................... + ldr q18, [x3, #32] // .*......................................... + ldr q22, [x3, #48] // ..*........................................ + ldr q26, [x3, #64] // ...*....................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q7, [x3, #80] // ....*...................................... + ldr q27, [x3, #16] // .........*................................. + ldr q15, [x3], #(6*16) // .................*......................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q19, [x4], #8 // .......................................*... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v30.4S, v10.4S, v11.4S // .....*..................................... + add v10.4S, v10.4S, v11.4S // .......*................................... + sub v24.4S, v12.4S, v13.4S // ......*.................................... + add v13.4S, v12.4S, v13.4S // ........*.................................. + ldr q11, [x4], #16 // .........................................*. 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mul v18.4S, v30.4S, v18.4S // ..........*................................ + sqrdmulh v22.4S, v30.4S, v22.4S // ...........*............................... + mul v26.4S, v24.4S, v26.4S // ............*.............................. + sqrdmulh v7.4S, v24.4S, v7.4S // .............*............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v30.4S, v10.4S, v13.4S // ..............*............................ + add v10.4S, v10.4S, v13.4S // ...................*....................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + mls v18.4S, v22.4S, v29.4S // ...............*........................... + mls v26.4S, v7.4S, v29.4S // ................*.......................... + sqrdmulh v22.4S, v30.4S, v27.4S // ..................*........................ + mul v7.4S, v30.4S, v15.4S // ......................*.................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v13.4S, v18.4S, v26.4S // ....................*...................... + add v18.4S, v18.4S, v26.4S // ........................*.................. + mls v7.4S, v22.4S, v29.4S // ..........................*................ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v30.4S, v10.4S, v18.4S // ...........................*............... + sqrdmulh v22.4S, v13.4S, v27.4S // .....................*..................... + mul v26.4S, v13.4S, v15.4S // .......................*................... + trn1 v10.4S, v10.4S, v18.4S // ............................*.............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ mls v26.4S, v22.4S, v29.4S // .........................*................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v22.4S, v7.4S, v26.4S // .............................*............. + trn2 v18.4S, v7.4S, v26.4S // ..............................*............ + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v28.2D, v30.2D, v18.2D // .................................*......... + trn1 v9.2D, v30.2D, v18.2D // ..................................*........ + trn1 v17.2D, v10.2D, v22.2D // ...............................*........... + trn2 v13.2D, v10.2D, v22.2D // ................................*.......... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v3.4S, v17.4S, v9.4S // ...................................*....... + add v2.4S, v13.4S, v28.4S // ....................................*...... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v10.4S, v3.4S, v2.4S // .....................................*..... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + srshr v18.4S, v10.4S, #23 // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v10.4S, v18.4S, v29.4S // ........................................*.. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ str q10, [x0], #(16*4) // ..........................................* + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // original source code + // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x0] // *.......................................... + // ldr q18, [x3, #32] // .*......................................... + // ldr q23, [x3, #48] // ..*........................................ + // ldr q12, [x3, #64] // ...*....................................... + // ldr q24, [x3, #80] // ....*...................................... + // sub v5.4S, v14.4S, v15.4S // ........*.................................. + // sub v6.4S, v16.4S, v17.4S // ..........*................................ + // add v21.4S, v14.4S, v15.4S // .........*................................. + // add v30.4S, v16.4S, v17.4S // ...........*............................... + // ldr q14, [x3, #16] // .....*..................................... + // mul v1.4S, v5.4S, v18.4S // .............*............................. + // sqrdmulh v0.4S, v5.4S, v23.4S // ..............*............................ + // mul v16.4S, v6.4S, v12.4S // ...............*........................... + // sqrdmulh v9.4S, v6.4S, v24.4S // ................*.......................... + // sub v6.4S, v21.4S, v30.4S // .................*......................... + // mls v1.4S, v0.4S, v29.4S // ...................*....................... + // mls v16.4S, v9.4S, v29.4S // ....................*...................... + // ldr q12, [x3], #(6*16) // ......*.................................... + // sqrdmulh v18.4S, v6.4S, v14.4S // .....................*..................... 
+ // add v20.4S, v21.4S, v30.4S // ..................*........................ + // sub v31.4S, v1.4S, v16.4S // .......................*................... + // sqrdmulh v22.4S, v31.4S, v14.4S // ...........................*............... + // mul v15.4S, v6.4S, v12.4S // ......................*.................... + // mul v14.4S, v31.4S, v12.4S // ............................*.............. + // add v2.4S, v1.4S, v16.4S // ........................*.................. + // mls v14.4S, v22.4S, v29.4S // ..............................*............ + // mls v15.4S, v18.4S, v29.4S // .........................*................. + // trn2 v9.4S, v20.4S, v2.4S // ..........................*................ + // trn1 v2.4S, v20.4S, v2.4S // .............................*............. + // trn1 v18.4S, v15.4S, v14.4S // ...............................*........... + // trn2 v10.4S, v15.4S, v14.4S // ................................*.......... + // trn1 v17.2D, v2.2D, v18.2D // ...................................*....... + // trn2 v13.2D, v2.2D, v18.2D // ....................................*...... + // trn2 v28.2D, v9.2D, v10.2D // .................................*......... + // trn1 v9.2D, v9.2D, v10.2D // ..................................*........ + // add v3.4S, v17.4S, v9.4S // .....................................*..... + // add v2.4S, v13.4S, v28.4S // ......................................*.... + // add v21.4S, v3.4S, v2.4S // .......................................*... + // srshr v12.4S, v21.4S, #23 // ........................................*.. + // ldr q19, [x4], #8 // .......*................................... + // mls v21.4S, v12.4S, v29.4S // .........................................*. + // ldr q11, [x4], #16 // ............*.............................. 
+ // str q21, [x0], #(16*4) // ..........................................* + + sub count, count, #1 +layer5678_start: + sub v26.4S, v17.4S, v9.4S // .....................................*........................... + // gap // ................................................................. + // gap // ................................................................. + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x0] // e................................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v4.4S, v26.4S, v11.S[0] // .......................................*......................... + sqrdmulh v8.4S, v26.4S, v11.S[1] // ........................................*........................ + ldr q18, [x3, #32] // ...e............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + ldr q23, [x3, #48] // ....e............................................................ + ldr q12, [x3, #64] // .....e........................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q24, [x3, #80] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v2.4S, v3.4S, v2.4S // ...............................................*................. + mls v4.4S, v8.4S, v29.4S // .........................................*....................... + sub v8.4S, v13.4S, v28.4S // ..........................................*...................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v5.4S, v14.4S, v15.4S // .......e......................................................... + sub v6.4S, v16.4S, v17.4S // ............e.................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v28.4S, v8.4S, v11.S[3] // .............................................*................... + add v21.4S, v14.4S, v15.4S // ........e........................................................ + add v30.4S, v16.4S, v17.4S // .............e................................................... + ldr q14, [x3, #16] // ..e.............................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v1.4S, v5.4S, v18.4S // .........e....................................................... + sqrdmulh v0.4S, v5.4S, v23.4S // ..........e...................................................... + mul v16.4S, v6.4S, v12.4S // ..............e.................................................. + sqrdmulh v9.4S, v6.4S, v24.4S // ...............e................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v23.4S, v8.4S, v11.S[2] // ............................................*.................... + sqrdmulh v17.4S, v2.4S, v19.S[1] // ..................................................*.............. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v6.4S, v21.4S, v30.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v1.4S, v0.4S, v29.4S // ...........e..................................................... + mls v16.4S, v9.4S, v29.4S // ................e................................................ + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v23.4S, v28.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + // gap // ................................................................. + ldr q12, [x3], #(6*16) // .e............................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v18.4S, v6.4S, v14.4S // ....................e............................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v20.4S, v21.4S, v30.4S // ..................e.............................................. + sub v31.4S, v1.4S, v16.4S // ......................e.......................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v28.4S, v4.4S, v23.4S // .....................................................*........... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v22.4S, v31.4S, v14.4S // .........................e....................................... + mul v15.4S, v6.4S, v12.4S // ...................e............................................. + mul v14.4S, v31.4S, v12.4S // ........................e........................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v25.4S, v4.4S, v23.4S // ....................................................*............ + mul v12.4S, v2.4S, v19.S[0] // .................................................*............... + srshr v6.4S, v28.4S, #23 // ...........................................................*..... + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v2.4S, v1.4S, v16.4S // .......................e......................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v22.4S, v29.4S // ..........................e...................................... + mls v15.4S, v18.4S, v29.4S // .....................e........................................... + sqrdmulh v31.4S, v25.4S, v19.S[1] // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v12.4S, v17.4S, v29.4S // ...................................................*............. + mls v28.4S, v6.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v9.4S, v20.4S, v2.4S // ............................e.................................... + trn1 v2.4S, v20.4S, v2.4S // ...........................e..................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn1 v18.4S, v15.4S, v14.4S // .............................e................................... + trn2 v10.4S, v15.4S, v14.4S // ..............................e.................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q12, [x0, #-32] // ...............................................................*. + mul v12.4S, v25.4S, v19.S[0] // ......................................................*.......... + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn1 v17.2D, v2.2D, v18.2D // .................................e............................... + trn2 v13.2D, v2.2D, v18.2D // ...............................e................................. + str q28, [x0, #-48] // ..............................................................*.. + trn2 v28.2D, v9.2D, v10.2D // ................................e................................ + trn1 v9.2D, v9.2D, v10.2D // ..................................e.............................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v3.4S, v17.4S, v9.4S // ......................................e.......................... + add v2.4S, v13.4S, v28.4S // ...........................................e..................... + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v12.4S, v31.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + add v21.4S, v3.4S, v2.4S // ................................................e................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q12, [x0, #-16] // ................................................................* + srshr v12.4S, v21.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q19, [x4], #8 // ...................................e............................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v21.4S, v12.4S, v29.4S // ..........................................................e...... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q11, [x4], #16 // ....................................e............................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + str q21, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e...............................................................|e.......................................................... + // ldr q0, [x3], #(6*16) // ..........................e.....................................|..........................e................................ + // ldr q4, [x3, #(-6*16 + 1*16)] // ...............e................................................|...............e........................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ...e............................................................|...e....................................................... + // ldr q5, [x3, #(-6*16 + 3*16)] // ....e...........................................................|....e...................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // .....e..........................................................|.....e..................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ......e.........................................................|......e.................................................... 
+ // sub v24.4s, v8.4s, v9.4s // ..........e.....................................................|..........e................................................ + // add v8.4s, v8.4s, v9.4s // .............e..................................................|.............e............................................. + // mul v9.4s, v24.4s, v1.4s // ................e...............................................|................e.......................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .................e..............................................|.................e......................................... + // mls v9.4s, v24.4s, v29.4s // .......................e........................................|.......................e................................... + // sub v24.4s, v10.4s, v11.4s // ...........e....................................................|...........e............................................... + // add v10.4s, v10.4s, v11.4s // ..............e.................................................|..............e............................................ + // mul v11.4s, v24.4s, v2.4s // ..................e.............................................|..................e........................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ...................e............................................|...................e....................................... + // mls v11.4s, v24.4s, v29.4s // ........................e.......................................|........................e.................................. + // sub v24.4s, v8.4s, v10.4s // ......................e.........................................|......................e.................................... + // add v8.4s, v8.4s, v10.4s // ............................e...................................|............................e.............................. 
+ // mul v10.4s, v24.4s, v0.4s // ................................e...............................|................................e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...........................e....................................|...........................e............................... + // mls v10.4s, v24.4s, v29.4s // .......................................e........................|.......................................e................... + // sub v24.4s, v9.4s, v11.4s // .............................e..................................|.............................e............................. + // add v9.4s, v9.4s, v11.4s // .....................................e..........................|.....................................e..................... + // mul v11.4s, v24.4s, v0.4s // .................................e..............................|.................................e......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................e................................|...............................e........................... + // mls v11.4s, v24.4s, v29.4s // ......................................e.........................|......................................e.................... + // trn1 v25.4s, v8.4s, v9.4s // ............................................e...................|............................................e.............. + // trn2 v26.4s, v8.4s, v9.4s // ...........................................e....................|...........................................e............... + // trn1 v27.4s, v10.4s, v11.4s // .............................................e..................|.............................................e............. + // trn2 v28.4s, v10.4s, v11.4s // ..............................................e.................|..............................................e............ 
+ // trn2 v10.2d, v25.2d, v27.2d // ..................................................e.............|..................................................e........ + // trn2 v11.2d, v26.2d, v28.2d // ....................................................e...........|....................................................e...... + // trn1 v8.2d, v25.2d, v27.2d // .................................................e..............|.................................................e......... + // trn1 v9.2d, v26.2d, v28.2d // .....................................................e..........|.....................................................e..... + // ldr q1, [x4], #8 // ............................................................e...|........................................................... + // ldr q0, [x4], #16 // ..............................................................e.|........................................................... + // sub v24.4s, v8.4s, v9.4s // ................................................................*........................................................... + // add v8.4s, v8.4s, v9.4s // ......................................................e.........|......................................................e.... + // mul v9.4s, v24.4s, v0.s[0] // .*..............................................................|.*......................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..*.............................................................|..*........................................................ + // mls v9.4s, v24.4s, v29.4s // ........*.......................................................|........*.................................................. + // sub v24.4s, v10.4s, v11.4s // .........*......................................................|.........*................................................. 
+ // add v10.4s, v10.4s, v11.4s // .......................................................e........|.......................................................e... + // mul v11.4s, v24.4s, v0.s[2] // ....................*...........................................|....................*...................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ............*...................................................|............*.............................................. + // mls v11.4s, v24.4s, v29.4s // .........................*......................................|.........................*................................. + // sub v24.4s, v8.4s, v10.4s // .......*........................................................|.......*................................................... + // add v8.4s, v8.4s, v10.4s // .........................................................e......|.........................................................e. + // mul v10.4s, v24.4s, v1.s[0] // ...................................*............................|...................................*....................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................*..........................................|.....................*..................................... + // mls v10.4s, v24.4s, v29.4s // .........................................*......................|.........................................*................. + // sub v24.4s, v9.4s, v11.4s // ..................................*.............................|..................................*........................ + // add v9.4s, v9.4s, v11.4s // ..............................*.................................|..............................*............................ + // mul v11.4s, v24.4s, v1.s[0] // ................................................*...............|................................................*.......... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................*.......................|........................................*.................. + // mls v11.4s, v24.4s, v29.4s // ........................................................*.......|........................................................*.. + // srshr v24.4S, v8.4S, #23 // ...........................................................e....|........................................................... + // mls v8.4s, v24.4s, v29.4s // .............................................................e..|........................................................... + // srshr v24.4S, v9.4S, #23 // ....................................*...........................|....................................*...................... + // mls v9.4s, v24.4s, v29.4s // ..........................................*.....................|..........................................*................ + // str q8, [x0], #(16*4) // ...............................................................e|........................................................... + // str q9, [x0, #(-16*4 + 1*16)] // ...................................................*............|...................................................*....... + // str q10, [x0, #(-16*4 + 2*16)] // ...............................................*................|...............................................*........... + // str q11, [x0, #(-16*4 + 3*16)] // ..........................................................*.....|..........................................................* + + sub count, count, #1 + cbnz count, layer5678_start + sub v10.4S, v17.4S, v9.4S // *..................... + sub v18.4S, v3.4S, v2.4S // ...*.................. + sub v22.4S, v13.4S, v28.4S // .....*................ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mul v26.4S, v10.4S, v11.S[0] // .*.................... + sqrdmulh v10.4S, v10.4S, v11.S[1] // ..*................... + sqrdmulh v7.4S, v22.4S, v11.S[3] // ......*............... + mul v22.4S, v22.4S, v11.S[2] // .......*.............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqrdmulh v30.4S, v18.4S, v19.S[1] // ........*............. + mul v18.4S, v18.4S, v19.S[0] // ............*......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v26.4S, v10.4S, v29.4S // ....*................. + mls v22.4S, v7.4S, v29.4S // .........*............ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v18.4S, v30.4S, v29.4S // ...............*...... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sub v10.4S, v26.4S, v22.4S // ...........*.......... + add v22.4S, v26.4S, v22.4S // ..........*........... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q18, [x0, #-32] // .................*.... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqrdmulh v18.4S, v10.4S, v19.S[1] // ..............*....... + mul v10.4S, v10.4S, v19.S[0] // ..................*... + srshr v26.4S, v22.4S, #23 // .............*........ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v22.4S, v26.4S, v29.4S // ................*..... + mls v10.4S, v18.4S, v29.4S // ....................*. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q22, [x0, #-48] // ...................*.. + str q10, [x0, #-16] // .....................* + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + + // original source code + // sub v26.4S, v17.4S, v9.4S // *..................... + // mul v4.4S, v26.4S, v11.S[0] // ...*.................. + // sqrdmulh v8.4S, v26.4S, v11.S[1] // ....*................. + // sub v2.4S, v3.4S, v2.4S // .*.................... + // mls v4.4S, v8.4S, v29.4S // .........*............ + // sub v8.4S, v13.4S, v28.4S // ..*................... + // sqrdmulh v28.4S, v8.4S, v11.S[3] // .....*................ + // mul v23.4S, v8.4S, v11.S[2] // ......*............... + // sqrdmulh v17.4S, v2.4S, v19.S[1] // .......*.............. + // mls v23.4S, v28.4S, v29.4S // ..........*........... + // add v28.4S, v4.4S, v23.4S // .............*........ + // sub v25.4S, v4.4S, v23.4S // ............*......... + // mul v12.4S, v2.4S, v19.S[0] // ........*............. + // srshr v6.4S, v28.4S, #23 // .................*.... + // sqrdmulh v31.4S, v25.4S, v19.S[1] // ...............*...... + // mls v12.4S, v17.4S, v29.4S // ...........*.......... + // mls v28.4S, v6.4S, v29.4S // ..................*... + // str q12, [x0, #-32] // ..............*....... 
+ // mul v12.4S, v25.4S, v19.S[0] // ................*..... + // str q28, [x0, #-48] // ....................*. + // mls v12.4S, v31.4S, v29.4S // ...................*.. + // str q12, [x0, #-16] // .....................* + + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q11, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + ldr q9, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q12, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... 
+ ldr q22, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + ldr q18, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + ldr q27, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q24, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. + ldr q21, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ 
+ ldr q15, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + ldr q14, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + sub v10.4S, v12.4S, v9.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q19, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + sub v16.4S, v11.4S, v22.4S // .....................*.................................................................................................................................................................................................................................................................. + add v8.4S, v11.4S, v22.4S // ......................*................................................................................................................................................................................................................................................................. 
+ sub v22.4S, v20.4S, v18.4S // ................*....................................................................................................................................................................................................................................................................... + sub v11.4S, v27.4S, v24.4S // ....................................*................................................................................................................................................................................................................................................... + add v17.4S, v27.4S, v24.4S // .....................................*.................................................................................................................................................................................................................................................. + mul v13.4S, v10.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + add v24.4S, v12.4S, v9.4S // ...........................*............................................................................................................................................................................................................................................................ 
+ add v18.4S, v20.4S, v18.4S // .................*...................................................................................................................................................................................................................................................................... + sub v28.4S, v14.4S, v21.4S // .........................................*.............................................................................................................................................................................................................................................. + sqrdmulh v23.4S, v16.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v11.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + mul v20.4S, v11.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. + mul v11.4S, v16.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ 
+ sub v12.4S, v19.4S, v15.4S // ...............................*........................................................................................................................................................................................................................................................ + add v14.4S, v14.4S, v21.4S // ..........................................*............................................................................................................................................................................................................................................. + mul v21.4S, v28.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v28.4S, v28.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + mls v13.4S, v10.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... + ldr q10, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
+ ldr q16, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v12.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... + add v19.4S, v19.4S, v15.4S // ................................*....................................................................................................................................................................................................................................................... + mul v15.4S, v12.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + ldr q12, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + mls v11.4S, v23.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. 
+ ldr q23, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + mls v20.4S, v9.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v9.4S, v18.4S, v8.4S // ........................................................*............................................................................................................................................................................................................................... + add v8.4S, v18.4S, v8.4S // .........................................................*.............................................................................................................................................................................................................................. + mul v18.4S, v22.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + mls v21.4S, v28.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... 
+ add v28.4S, v24.4S, v19.4S // ...................................................................*.................................................................................................................................................................................................................... + sub v24.4S, v24.4S, v19.4S // ..................................................................*..................................................................................................................................................................................................................... + sqrdmulh v19.4S, v22.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mls v15.4S, v27.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sub v27.4S, v16.4S, v10.4S // ..............................................*......................................................................................................................................................................................................................................... + add v22.4S, v16.4S, v10.4S // ...............................................*........................................................................................................................................................................................................................................ 
+ sqrdmulh v16.4S, v9.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mul v10.4S, v9.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. + add v9.4S, v12.4S, v23.4S // ....................................................*................................................................................................................................................................................................................................... + sub v12.4S, v12.4S, v23.4S // ...................................................*.................................................................................................................................................................................................................................... + sub v23.4S, v20.4S, v21.4S // .................................................................................*...................................................................................................................................................................................................... + mls v18.4S, v19.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... 
+ add v21.4S, v20.4S, v21.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v10.4S, v16.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... + sqrdmulh v19.4S, v24.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. + mul v16.4S, v24.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + mul v24.4S, v12.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + sqrdmulh v20.4S, v12.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. 
+ add v12.4S, v22.4S, v9.4S // .......................................................................................*................................................................................................................................................................................................ + sub v22.4S, v22.4S, v9.4S // ......................................................................................*................................................................................................................................................................................................. + add v9.4S, v18.4S, v11.4S // ..............................................................*......................................................................................................................................................................................................................... + sub v11.4S, v18.4S, v11.4S // .............................................................*.......................................................................................................................................................................................................................... + add v18.4S, v13.4S, v15.4S // ........................................................................*............................................................................................................................................................................................................... + mls v16.4S, v19.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. 
+ mls v24.4S, v20.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + sub v15.4S, v13.4S, v15.4S // .......................................................................*................................................................................................................................................................................................................ + add v19.4S, v17.4S, v14.4S // .............................................................................*.......................................................................................................................................................................................................... + sqrdmulh v20.4S, v11.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + sub v17.4S, v17.4S, v14.4S // ............................................................................*........................................................................................................................................................................................................... + mul v14.4S, v11.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ 
+ mul v13.4S, v27.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + sqrdmulh v11.4S, v27.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + mul v27.4S, v22.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + sqrdmulh v22.4S, v22.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + mls v14.4S, v20.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + sqrdmulh v20.4S, v15.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. 
+ mul v15.4S, v15.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + mls v27.4S, v22.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + add v22.4S, v19.4S, v12.4S // .....................................................................................................................*.................................................................................................................................................................. + sub v19.4S, v19.4S, v12.4S // ....................................................................................................................*................................................................................................................................................................... + sub v12.4S, v8.4S, v28.4S // ................................................................................................*....................................................................................................................................................................................... + add v8.4S, v8.4S, v28.4S // .................................................................................................*...................................................................................................................................................................................... 
+ sub v28.4S, v10.4S, v16.4S // ..........................................................................................................*............................................................................................................................................................................. + add v10.4S, v10.4S, v16.4S // ...........................................................................................................*............................................................................................................................................................................ + sqrdmulh v16.4S, v23.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + mul v23.4S, v23.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... + mls v13.4S, v11.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + sub v11.4S, v8.4S, v22.4S // ........................................................................................................................................*............................................................................................................................................... 
+ add v8.4S, v8.4S, v22.4S // .........................................................................................................................................*.............................................................................................................................................. + mul v22.4S, v17.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mls v15.4S, v20.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v20.4S, v19.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + sqrdmulh v19.4S, v19.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ 
+ mls v23.4S, v16.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + add v16.4S, v13.4S, v24.4S // ............................................................................................*........................................................................................................................................................................................... + mls v22.4S, v17.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sub v24.4S, v13.4S, v24.4S // ...........................................................................................*............................................................................................................................................................................................ + sqrdmulh v13.4S, v11.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + add v17.4S, v21.4S, v16.4S // ..........................................................................................................................*............................................................................................................................................................. 
+ sub v21.4S, v21.4S, v16.4S // .........................................................................................................................*.............................................................................................................................................................. + mul v16.4S, v11.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v11.4S, v9.4S, v18.4S // .....................................................................................................*.................................................................................................................................................................................. + mls v20.4S, v19.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v9.4S, v9.4S, v18.4S // ......................................................................................................*................................................................................................................................................................................. + add v18.4S, v22.4S, v27.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ sub v22.4S, v22.4S, v27.4S // ..............................................................................................................................*......................................................................................................................................................... + sub v27.4S, v14.4S, v15.4S // ...............................................................................................................*........................................................................................................................................................................ + add v14.4S, v14.4S, v15.4S // ................................................................................................................*....................................................................................................................................................................... + sqrdmulh v19.4S, v12.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sub v15.4S, v9.4S, v17.4S // .............................................................................................................................................*.......................................................................................................................................... 
+ add v9.4S, v9.4S, v17.4S // ..............................................................................................................................................*......................................................................................................................................... + sqrdmulh v17.4S, v24.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v24.4S, v24.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + mls v16.4S, v13.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + sqrdmulh v13.4S, v22.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mls v12.4S, v19.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... 
+ sub v19.4S, v10.4S, v18.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v18.4S // ...................................................................................................................................................*.................................................................................................................................... + mul v18.4S, v22.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + mls v24.4S, v17.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + mul v17.4S, v28.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + sqrdmulh v22.4S, v28.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... 
+ mls v18.4S, v13.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mul v13.4S, v11.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + sqrdmulh v11.4S, v11.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mls v17.4S, v22.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + sqrdmulh v22.4S, v27.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sub v28.4S, v23.4S, v24.4S // ...................................................................................................................................*.................................................................................................................................................... 
+ add v23.4S, v23.4S, v24.4S // ....................................................................................................................................*................................................................................................................................................... + mul v24.4S, v27.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... + mul v27.4S, v21.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + mls v13.4S, v11.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sqrdmulh v21.4S, v21.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + sub count, count, #1 +layer1234_start: + mls v24.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... 
+ cmge v22.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + mls v27.4S, v21.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + cmge v21.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + add v11.4S, v14.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + sub v23.4S, v14.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v14.4S, v17.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ 
+ sub v18.4S, v17.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. + sub v17.4S, v21.4S, v22.4S // ..................................................................................................................................................................................*..................................................................................................... + mul v22.4S, v18.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + sqrdmulh v21.4S, v18.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + mls v16.4S, v17.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... 
+ sqrdmulh v17.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v23.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v22.4S, v21.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + mul v21.4S, v28.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ mls v18.4S, v17.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + mul v17.4S, v15.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + sqrdmulh v23.4S, v15.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + sub v16.4S, v13.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... 
+ add v13.4S, v13.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... + sub v27.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + mls v17.4S, v23.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + mls v21.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + cmge v15.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... 
+ cmge v20.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + sub v15.4S, v15.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + mul v20.4S, v27.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v27.4S, v27.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ 
+ sub v23.4S, v23.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + sub v28.4S, v24.4S, v21.4S // ...........................................................................................................................................................................*............................................................................................................ + mls v18.4S, v15.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + add v15.4S, v24.4S, v21.4S // ............................................................................................................................................................................*........................................................................................................... + cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v21.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + mls v20.4S, v27.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sqrdmulh v27.4S, v28.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + mul v23.4S, v28.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sub v28.4S, v24.4S, v21.4S // ..........................................................................................................................................................................................................*............................................................................. + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ 
+ mul v24.4S, v16.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sqrdmulh v18.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v13.4S, v16.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sqrdmulh v8.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mls v16.4S, v8.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sqrdmulh v8.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v23.4S, v27.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... 
+ mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + mls v21.4S, v18.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v18.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v24.4S, v13.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. 
+ cmge v13.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v28.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v19.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + sub v8.4S, v27.4S, v18.4S // ......................................................................................................................................................................................*................................................................................................. + cmge v11.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v18.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ sub v27.4S, v13.4S, v28.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v28.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + sqrdmulh v13.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + sub v11.4S, v11.4S, v18.4S // ..............................................................................................................................................................................................................*......................................................................... + mls v17.4S, v8.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ sqrdmulh v8.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + cmge v14.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v10.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + mls v20.4S, v27.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + cmge v27.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v23.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ 
+ sub v11.4S, v10.4S, v14.4S // ..............................................................................................................................................................................................................................................................*......................... + sub v10.4S, v28.4S, v27.4S // ..................................................................................................................................................................................................................................................*..................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + mls v22.4S, v13.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. 
+ mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + mls v16.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + sqrdmulh v10.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ 
+ mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v28.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + cmge v15.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v12.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v17.4S, v10.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + cmge v10.4S, v24.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v16.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + sub v27.4S, v13.4S, v15.4S // ......................................................................................................................................................................................................................................................................*................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + ldr q28, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... 
+ mls v18.4S, v8.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sub v13.4S, v12.4S, v16.4S // ..........................................................................................................................................................................................................................................................................*............. + ldr q12, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + sub v15.4S, v12.4S, v28.4S // .....................e.................................................................................................................................................................................................................................................................. + cmge v9.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + mls v19.4S, v11.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ cmge v11.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v16.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v8.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + cmge v14.4S, v31.4S, v24.4S // ....................................................................................................................................................................................................*................................................................................... + mls v22.4S, v13.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + ldr q13, [x1, #0] // e....................................................................................................................................................................................................................................................................................... 
+ add v28.4S, v12.4S, v28.4S // ......................e................................................................................................................................................................................................................................................................. + sub v12.4S, v9.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... + cmge v9.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + ldr q11, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... + sub v8.4S, v16.4S, v8.4S // ..................................................................................................................................................................................................................................................................*..................... + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... 
+ mls v21.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v27.4S, v14.4S, v10.4S // ......................................................................................................................................................................................................*................................................................................. + cmge v16.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + ldr q14, [x1, #320] // .....e.................................................................................................................................................................................................................................................................................. + ldr q19, [x1, #256] // ....e................................................................................................................................................................................................................................................................................... + mls v23.4S, v12.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ sub v12.4S, v9.4S, v16.4S // ......................................................................................................................................................................................................................................................*................................. + mls v20.4S, v8.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v24.4S, v27.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + sub v16.4S, v13.4S, v11.4S // ................e....................................................................................................................................................................................................................................................................... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. 
+ cmge v21.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + add v9.4S, v19.4S, v14.4S // ...........................e............................................................................................................................................................................................................................................................ + add v8.4S, v13.4S, v11.4S // .................e...................................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v15.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + str q24, [x1, #816] // .....................................................................................................................................................................................................................*.................................................................. + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... 
+ ldr q20, [x1, #384] // ......e................................................................................................................................................................................................................................................................................. + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + sub v13.4S, v19.4S, v14.4S // ..........................e............................................................................................................................................................................................................................................................. + mul v11.4S, v15.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + sqrdmulh v19.4S, v16.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... 
+ ldr q15, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. + mul v16.4S, v16.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + ldr q23, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + ldr q22, [x1, #448] // .......e................................................................................................................................................................................................................................................................................ + sub v24.4S, v27.4S, v21.4S // ..........................................................................................................................................................................................................................................................*............................. + mls v17.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ add v21.4S, v8.4S, v28.4S // .........................................................e.............................................................................................................................................................................................................................. + ldr q14, [x1, #512] // ........e............................................................................................................................................................................................................................................................................... + sub v12.4S, v8.4S, v28.4S // ........................................................e............................................................................................................................................................................................................................... + ldr q28, [x1, #576] // .........e.............................................................................................................................................................................................................................................................................. + sqrdmulh v27.4S, v13.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mul v13.4S, v13.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... 
+ mls v16.4S, v19.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + mls v18.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + sqrdmulh v19.4S, v12.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v11.4S, v10.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... + mls v13.4S, v27.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... 
+ str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + add v8.4S, v20.4S, v22.4S // ................................e....................................................................................................................................................................................................................................................... + sub v17.4S, v20.4S, v22.4S // ...............................e........................................................................................................................................................................................................................................................ + mul v10.4S, v12.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + add v24.4S, v14.4S, v28.4S // .....................................e.................................................................................................................................................................................................................................................. + sub v22.4S, v15.4S, v23.4S // .........................................e.............................................................................................................................................................................................................................................. 
+ add v23.4S, v15.4S, v23.4S // ..........................................e............................................................................................................................................................................................................................................. + sub v28.4S, v14.4S, v28.4S // ....................................e................................................................................................................................................................................................................................................... + add v14.4S, v9.4S, v8.4S // ...................................................................e.................................................................................................................................................................................................................... + sub v27.4S, v9.4S, v8.4S // ..................................................................e..................................................................................................................................................................................................................... + mul v20.4S, v17.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sqrdmulh v12.4S, v17.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... 
+ mul v8.4S, v28.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + sqrdmulh v15.4S, v28.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ + sub v17.4S, v16.4S, v11.4S // .............................................................e.......................................................................................................................................................................................................................... + mls v10.4S, v19.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + ldr q19, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + ldr q28, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ 
+ add v18.4S, v16.4S, v11.4S // ..............................................................e......................................................................................................................................................................................................................... + mul v9.4S, v27.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... + add v16.4S, v21.4S, v14.4S // .................................................................................................e...................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + sub v11.4S, v24.4S, v23.4S // ............................................................................e........................................................................................................................................................................................................... + add v24.4S, v24.4S, v23.4S // .............................................................................e.......................................................................................................................................................................................................... 
+ mul v23.4S, v22.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + sqrdmulh v22.4S, v22.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mls v20.4S, v12.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + mls v8.4S, v15.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + mls v9.4S, v27.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + ldr q27, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... 
+ sub v12.4S, v21.4S, v14.4S // ................................................................................................e....................................................................................................................................................................................... + ldr q15, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + sub v14.4S, v19.4S, v28.4S // ...................................................e.................................................................................................................................................................................................................................... + mls v23.4S, v22.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + add v28.4S, v19.4S, v28.4S // ....................................................e................................................................................................................................................................................................................................... + add v19.4S, v13.4S, v20.4S // ........................................................................e............................................................................................................................................................................................................... 
+ sub v13.4S, v13.4S, v20.4S // .......................................................................e................................................................................................................................................................................................................ + mul v21.4S, v14.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. + sqrdmulh v22.4S, v14.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mul v14.4S, v11.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sqrdmulh v20.4S, v11.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + add v11.4S, v8.4S, v23.4S // ..................................................................................e..................................................................................................................................................................................................... 
+ sub v23.4S, v8.4S, v23.4S // .................................................................................e...................................................................................................................................................................................................... + add v8.4S, v15.4S, v27.4S // ...............................................e........................................................................................................................................................................................................................................ + sub v27.4S, v15.4S, v27.4S // ..............................................e......................................................................................................................................................................................................................................... + mls v21.4S, v22.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mul v15.4S, v13.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + sqrdmulh v22.4S, v13.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. 
+ mls v14.4S, v20.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... + add v13.4S, v8.4S, v28.4S // .......................................................................................e................................................................................................................................................................................................ + sub v8.4S, v8.4S, v28.4S // ......................................................................................e................................................................................................................................................................................................. + sqrdmulh v28.4S, v27.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mul v27.4S, v27.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sub v20.4S, v24.4S, v13.4S // ....................................................................................................................e................................................................................................................................................................... 
+ add v13.4S, v24.4S, v13.4S // .....................................................................................................................e.................................................................................................................................................................. + mls v15.4S, v22.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + mls v27.4S, v28.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + sqrdmulh v22.4S, v20.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mul v20.4S, v20.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. + sqrdmulh v24.4S, v23.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ mul v23.4S, v23.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + add v28.4S, v18.4S, v19.4S // ......................................................................................................e................................................................................................................................................................................. + sub v19.4S, v18.4S, v19.4S // .....................................................................................................e.................................................................................................................................................................................. + mul v18.4S, v8.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + mls v20.4S, v22.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + sub v22.4S, v27.4S, v21.4S // ...........................................................................................e............................................................................................................................................................................................ 
+ add v27.4S, v27.4S, v21.4S // ............................................................................................e........................................................................................................................................................................................... + sqrdmulh v21.4S, v8.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + mls v23.4S, v24.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + mul v24.4S, v17.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sqrdmulh v8.4S, v17.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + sub v17.4S, v10.4S, v9.4S // ..........................................................................................................e............................................................................................................................................................................. 
+ add v10.4S, v10.4S, v9.4S // ...........................................................................................................e............................................................................................................................................................................ + sqrdmulh v9.4S, v22.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mul v22.4S, v22.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + mls v18.4S, v21.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + sqrdmulh v21.4S, v19.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + mls v24.4S, v8.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... 
+ add v8.4S, v16.4S, v13.4S // .........................................................................................................................................e.............................................................................................................................................. + sub v16.4S, v16.4S, v13.4S // ........................................................................................................................................e............................................................................................................................................... + mul v13.4S, v19.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ + sqrdmulh v19.4S, v17.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + mul v17.4S, v17.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... + mls v13.4S, v21.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. 
+ mls v22.4S, v9.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + sqrdmulh v9.4S, v16.4S, v0.S[1] // ...........................................................................................................................................e............................................................................................................................................ + mul v16.4S, v16.4S, v0.S[0] // ..........................................................................................................................................e............................................................................................................................................. + mls v17.4S, v19.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... + add v21.4S, v14.4S, v18.4S // ...............................................................................................................................e........................................................................................................................................................ + sub v18.4S, v14.4S, v18.4S // ..............................................................................................................................e......................................................................................................................................................... 
+ add v14.4S, v24.4S, v15.4S // ................................................................................................................e....................................................................................................................................................................... + sub v19.4S, v24.4S, v15.4S // ...............................................................................................................e........................................................................................................................................................................ + add v15.4S, v11.4S, v27.4S // ..........................................................................................................................e............................................................................................................................................................. + sqrdmulh v24.4S, v12.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + mul v12.4S, v12.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + mls v16.4S, v9.4S, v29.4S // ............................................................................................................................................e........................................................................................................................................... 
+ sub v11.4S, v11.4S, v27.4S // .........................................................................................................................e.............................................................................................................................................................. + sqrdmulh v27.4S, v18.4S, v1.S[1] // .................................................................................................................................e...................................................................................................................................................... + mul v18.4S, v18.4S, v1.S[0] // ................................................................................................................................e....................................................................................................................................................... + add v9.4S, v28.4S, v15.4S // ..............................................................................................................................................e......................................................................................................................................... + sub v15.4S, v28.4S, v15.4S // .............................................................................................................................................e.......................................................................................................................................... + sub v28.4S, v23.4S, v22.4S // ...................................................................................................................................e.................................................................................................................................................... 
+ add v23.4S, v23.4S, v22.4S // ....................................................................................................................................e................................................................................................................................................... + mls v12.4S, v24.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + mul v24.4S, v19.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + sqrdmulh v22.4S, v19.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + sub v19.4S, v10.4S, v21.4S // ..................................................................................................................................................e..................................................................................................................................... + add v10.4S, v10.4S, v21.4S // ...................................................................................................................................................e.................................................................................................................................... 
+ sqrdmulh v21.4S, v11.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mls v18.4S, v27.4S, v29.4S // ..................................................................................................................................e..................................................................................................................................................... + mul v27.4S, v11.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + + // original source code + // ldr q8, [x1, #0] // ............e...............................................................................................................................................................|.......................................................................................................................e................................................. + // ldr q9, [x1, #(1*(512/8))] // ................e...........................................................................................................................................................|...........................................................................................................................e............................................. 
+ // ldr q10, [x1, #(2*(512/8))] // ...e........................................................................................................................................................................|..............................................................................................................e.......................................................... + // ldr q11, [x1, #(3*(512/8))] // e...........................................................................................................................................................................|...........................................................................................................e............................................................. + // ldr q12, [x1, #(4*(512/8))] // .......................e....................................................................................................................................................|..................................................................................................................................e...................................... + // ldr q13, [x1, #(5*(512/8))] // ......................e.....................................................................................................................................................|.................................................................................................................................e....................................... + // ldr q14, [x1, #(6*(512/8))] // .....................................e......................................................................................................................................|................................................................................................................................................e........................ 
+ // ldr q15, [x1, #(7*(512/8))] // ..............................................e.............................................................................................................................|.........................................................................................................................................................e............... + // ldr q16, [x1, #(8*(512/8))] // ..................................................e.........................................................................................................................|.............................................................................................................................................................e........... + // ldr q17, [x1, #(9*(512/8))] // ....................................................e.......................................................................................................................|...............................................................................................................................................................e......... + // ldr q18, [x1, #(10*(512/8))] // ...........................................e................................................................................................................................|......................................................................................................................................................e.................. + // ldr q19, [x1, #(11*(512/8))] // .............................................e..............................................................................................................................|........................................................................................................................................................e................ 
+ // ldr q20, [x1, #(12*(512/8))] // ............................................................................................e...............................................................................|......................................................................................................................................................................... + // ldr q21, [x1, #(13*(512/8))] // ..........................................................................................e.................................................................................|......................................................................................................................................................................... + // ldr q22, [x1, #(14*(512/8))] // .............................................................................e..............................................................................................|......................................................................................................................................................................... + // ldr q23, [x1, #(15*(512/8))] // ..............................................................................e.............................................................................................|......................................................................................................................................................................... + // sub v24.4s, v8.4s, v9.4s // .............................e..............................................................................................................................................|........................................................................................................................................e................................ 
+ // add v8.4s, v8.4s, v9.4s // .................................e..........................................................................................................................................|............................................................................................................................................e............................ + // mul v9.4s, v24.4s, v3.s[2] // ............................................e...............................................................................................................................|.......................................................................................................................................................e................. + // sqrdmulh v24.4s, v24.4s, v3.s[3] // ..........................................e.................................................................................................................................|.....................................................................................................................................................e................... + // mls v9.4s, v24.4s, v29.4s // .......................................................e....................................................................................................................|..................................................................................................................................................................e...... + // sub v24.4s, v10.4s, v11.4s // ....e.......................................................................................................................................................................|...............................................................................................................e......................................................... 
+ // add v10.4s, v10.4s, v11.4s // .............e..............................................................................................................................................................|........................................................................................................................e................................................ + // mul v11.4s, v24.4s, v4.s[0] // .........................................e..................................................................................................................................|....................................................................................................................................................e.................... + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..................................e.........................................................................................................................................|.............................................................................................................................................e........................... + // mls v11.4s, v24.4s, v29.4s // ..........................................................e.................................................................................................................|.....................................................................................................................................................................e... + // sub v24.4s, v12.4s, v13.4s // ........................................e...................................................................................................................................|...................................................................................................................................................e..................... 
+ // add v12.4s, v12.4s, v13.4s // ................................e...........................................................................................................................................|...........................................................................................................................................e............................. + // mul v13.4s, v24.4s, v4.s[2] // ......................................................e.....................................................................................................................|.................................................................................................................................................................e....... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // .....................................................e......................................................................................................................|................................................................................................................................................................e........ + // mls v13.4s, v24.4s, v29.4s // ............................................................e...............................................................................................................|.......................................................................................................................................................................e. + // sub v24.4s, v14.4s, v15.4s // ...............................................................e............................................................................................................|......................................................................................................................................................................... 
+ // add v14.4s, v14.4s, v15.4s // ..............................................................e.............................................................................................................|......................................................................................................................................................................... + // mul v15.4s, v24.4s, v5.s[0] // .......................................................................e....................................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ........................................................................e...................................................................................................|......................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // .......................................................................................e....................................................................................|......................................................................................................................................................................... + // sub v24.4s, v16.4s, v17.4s // ....................................................................e.......................................................................................................|......................................................................................................................................................................... 
+ // add v16.4s, v16.4s, v17.4s // .................................................................e..........................................................................................................|......................................................................................................................................................................... + // mul v17.4s, v24.4s, v5.s[2] // .........................................................................e..................................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.s[3] // ..........................................................................e.................................................................................................|......................................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ........................................................................................e...................................................................................|......................................................................................................................................................................... + // sub v24.4s, v18.4s, v19.4s // ..................................................................e.........................................................................................................|......................................................................................................................................................................... 
+ // add v18.4s, v18.4s, v19.4s // ...................................................................e........................................................................................................|......................................................................................................................................................................... + // mul v19.4s, v24.4s, v6.s[0] // .....................................................................................e......................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ......................................................................................e.....................................................................................|......................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ..............................................................................................e.............................................................................|......................................................................................................................................................................... + // sub v24.4s, v20.4s, v21.4s // .........................................................................................................e..................................................................|......................................................................................................................................................................... 
+ // add v20.4s, v20.4s, v21.4s // ........................................................................................................e...................................................................|......................................................................................................................................................................... + // mul v21.4s, v24.4s, v6.s[2] // .................................................................................................................e..........................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ................................................................................................................e...........................................................|......................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // .....................................................................................................................e......................................................|......................................................................................................................................................................... + // sub v24.4s, v22.4s, v23.4s // .............................................................................................e..............................................................................|......................................................................................................................................................................... 
+ // add v22.4s, v22.4s, v23.4s // ...............................................................................................e............................................................................|......................................................................................................................................................................... + // mul v23.4s, v24.4s, v7.s[0] // ..................................................................................................e.........................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ...................................................................................................e........................................................................|......................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ..........................................................................................................e.................................................................|......................................................................................................................................................................... + // sub v24.4s, v8.4s, v10.4s // ...................................................e........................................................................................................................|..............................................................................................................................................................e.......... 
+ // add v8.4s, v8.4s, v10.4s // .................................................e..........................................................................................................................|............................................................................................................................................................e............ + // mul v10.4s, v24.4s, v1.s[2] // ................................................................e...........................................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................e..................................................................................................................|....................................................................................................................................................................e.... + // mls v10.4s, v24.4s, v29.4s // ............................................................................e...............................................................................................|......................................................................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ...........................................................................e................................................................................................|......................................................................................................................................................................... 
+ // add v9.4s, v9.4s, v11.4s // ...............................................................................e............................................................................................|......................................................................................................................................................................... + // mul v11.4s, v24.4s, v1.s[2] // ..................................................................................................................................e.........................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................................................................................................................e........................................|......................................................................................................................................................................... + // mls v11.4s, v24.4s, v29.4s // ..........................................................................................................................................e.................................|......................................................................................................................................................................... + // sub v24.4s, v12.4s, v14.4s // ......................................................................e.....................................................................................................|......................................................................................................................................................................... 
+ // add v12.4s, v12.4s, v14.4s // .....................................................................e......................................................................................................|......................................................................................................................................................................... + // mul v14.4s, v24.4s, v2.s[0] // ................................................................................e...........................................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..................................................................................e.........................................................................................|......................................................................................................................................................................... + // mls v14.4s, v24.4s, v29.4s // .........................................................................................e..................................................................................|......................................................................................................................................................................... + // sub v24.4s, v13.4s, v15.4s // .................................................................................................e..........................................................................|......................................................................................................................................................................... 
+ // add v13.4s, v13.4s, v15.4s // ................................................................................................e...........................................................................|......................................................................................................................................................................... + // mul v15.4s, v24.4s, v2.s[0] // ...........................................................................................................e................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ............................................................................................................e...............................................................|......................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ....................................................................................................................e.......................................................|......................................................................................................................................................................... + // sub v24.4s, v16.4s, v18.4s // ...................................................................................e........................................................................................|......................................................................................................................................................................... 
+ // add v16.4s, v16.4s, v18.4s // ....................................................................................e.......................................................................................|......................................................................................................................................................................... + // mul v18.4s, v24.4s, v2.s[2] // ....................................................................................................e.......................................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .....................................................................................................e......................................................................|......................................................................................................................................................................... + // mls v18.4s, v24.4s, v29.4s // .............................................................................................................e..............................................................|......................................................................................................................................................................... + // sub v24.4s, v17.4s, v19.4s // .......................................................................................................e....................................................................|......................................................................................................................................................................... 
+ // add v17.4s, v17.4s, v19.4s // ......................................................................................................e.....................................................................|......................................................................................................................................................................... + // mul v19.4s, v24.4s, v2.s[2] // .........................................................................................................................e..................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................................................e...................................................|......................................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // .................................................................................................................................e..........................................|......................................................................................................................................................................... + // sub v24.4s, v20.4s, v22.4s // ...............................................................................................................e............................................................|......................................................................................................................................................................... 
+ // add v20.4s, v20.4s, v22.4s // ..............................................................................................................e.............................................................|......................................................................................................................................................................... + // mul v22.4s, v24.4s, v3.s[0] // ............................................................................................................................e...............................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................................................e...........................................|......................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ........................................................................................................................................e...................................|......................................................................................................................................................................... + // sub v24.4s, v21.4s, v23.4s // ..............................................................................................................................e.............................................|......................................................................................................................................................................... 
+ // add v21.4s, v21.4s, v23.4s // ...............................................................................................................................e............................................|......................................................................................................................................................................... + // mul v23.4s, v24.4s, v3.s[0] // .......................................................................................................................................e....................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................................................................e.....................................|......................................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // .................................................................................................................................................e..........................|......................................................................................................................................................................... + // sub v24.4s, v8.4s, v12.4s // ...........................................................................................e................................................................................|......................................................................................................................................................................... 
+ // add v8.4s, v8.4s, v12.4s // .................................................................................e..........................................................................................|......................................................................................................................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...........................................................................................................................................................e................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................................................................................................................e.................|......................................................................................................................................................................... + // mls v12.4s, v24.4s, v29.4s // ....................................................................................................................................................................e.......|......................................................................................................................................................................... + // sub v24.4s, v9.4s, v13.4s // ...........................................................................................................................e................................................|......................................................................................................................................................................... 
+ // add v9.4s, v9.4s, v13.4s // ..........................................................................................................................e.................................................|......................................................................................................................................................................... + // mul v13.4s, v24.4s, v0.s[2] // .............................................................................................................................................e..............................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................e..................................|......................................................................................................................................................................... + // mls v13.4s, v24.4s, v29.4s // ................................................................................................................................................e...........................|......................................................................................................................................................................... + // sub v24.4s, v10.4s, v14.4s // ....................................................................................................................................e.......................................|......................................................................................................................................................................... 
+ // add v10.4s, v10.4s, v14.4s // .....................................................................................................................................e......................................|......................................................................................................................................................................... + // mul v14.4s, v24.4s, v0.s[2] // ...............................................................................................................................................e............................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................................................................e.............................|......................................................................................................................................................................... + // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................................................e.......................|......................................................................................................................................................................... + // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................................e...................|......................................................................................................................................................................... 
+ // add v11.4s, v11.4s, v15.4s // .......................................................................................................................................................e....................|......................................................................................................................................................................... + // mul v15.4s, v24.4s, v0.s[2] // .....................................................................................................................................................................e......|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ......................................................................................................................................................................e.....|......................................................................................................................................................................... + // mls v15.4s, v24.4s, v29.4s // ............................................................................................................................................................................*......................................................................................................................................................................... + // sub v24.4s, v16.4s, v20.4s // ..................................................................................................................e.........................................................|......................................................................................................................................................................... 
+ // add v16.4s, v16.4s, v20.4s // ...................................................................................................................e........................................................|......................................................................................................................................................................... + // mul v20.4s, v24.4s, v1.s[0] // .......................................................................................................................e....................................................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................e.....................................................|......................................................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // .............................................................................................................................e..............................................|......................................................................................................................................................................... + // sub v24.4s, v17.4s, v21.4s // .............................................................................................................................................................e..............|......................................................................................................................................................................... 
+ // add v17.4s, v17.4s, v21.4s // .........................................................................................................................................................e..................|......................................................................................................................................................................... + // mul v21.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................................e|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................................................e..|......................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.*....................................................................................................................................................................... + // sub v24.4s, v18.4s, v22.4s // ......................................................................................................................................................e.....................|......................................................................................................................................................................... 
+ // add v18.4s, v18.4s, v22.4s // .....................................................................................................................................................e......................|......................................................................................................................................................................... + // mul v22.4s, v24.4s, v1.s[0] // ...............................................................................................................................................................e............|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................................................................e.............|......................................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.|......................................................................................................................................................................... + // sub v24.4s, v19.4s, v23.4s // ..................................................................................................................................................................e.........|......................................................................................................................................................................... 
+ // add v19.4s, v19.4s, v23.4s // ...................................................................................................................................................................e........|......................................................................................................................................................................... + // mul v23.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................|................*........................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................|.................*....................................................................................................................................................... + // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................|............................*............................................................................................................................................ + // sub v24.4s, v8.4s, v16.4s // ............................................................................................................................................e...............................|......................................................................................................................................................................... 
+ // add v8.4s, v8.4s, v16.4s // ...........................................................................................................................................e................................|......................................................................................................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ...................................................................................................................................................e........................|......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................................e.........................|......................................................................................................................................................................... + // mls v16.4s, v24.4s, v29.4s // ............................................................................................................................................................e...............|......................................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // .................................................................................................................................................................e..........|......................................................................................................................................................................... 
+ // add v9.4s, v9.4s, v17.4s // ................................................................................................................................................................e...........|......................................................................................................................................................................... + // mul v17.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|....................*.................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|.....................*................................................................................................................................................... + // mls v17.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...........................*............................................................................................................................................. + // sub v24.4s, v10.4s, v18.4s // .......................................................................................................................................................................e....|......................................................................................................................................................................... 
+ // add v10.4s, v10.4s, v18.4s // ........................................................................................................................................................................e...|......................................................................................................................................................................... + // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|..........*.............................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|............*............................................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................|..................*...................................................................................................................................................... + // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................|....*.................................................................................................................................................................... 
+ // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................|...*..................................................................................................................................................................... + // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|.............*........................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|..............*.......................................................................................................................................................... + // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...................*..................................................................................................................................................... + // sub v24.4s, v12.4s, v20.4s // ............................................................................................................................................................................|.........................*............................................................................................................................................... 
+ // add v12.4s, v12.4s, v20.4s // ............................................................................................................................................................................|..........................*.............................................................................................................................................. + // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|..................................*...................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|...................................*..................................................................................................................................... + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...........................................*............................................................................................................................. + // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................|.......................*................................................................................................................................................. 
+ // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................|........................*................................................................................................................................................ + // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|................................................*........................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|...................................................*..................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.................................................................*....................................................................................................... + // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................|......*.................................................................................................................................................................. 
+ // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................|.....*................................................................................................................................................................... + // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|........*................................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|.........*............................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................|...............*......................................................................................................................................................... + // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................|.....................................*................................................................................................................................... 
+ // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................|.......................................*................................................................................................................................. + // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................|.............................................*........................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................|............................................*............................................................................................................................ + // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................|.........................................................*............................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................|..*...................................................................................................................................................................... 
+ // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................|*........................................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.......*................................................................................................................................................................. + // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...........*............................................................................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................|.............................................................*........................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................|...............................................................*......................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.....................................................................*................................................................................................... + // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................|............................................................................*............................................................................................ + // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................|.............................*........................................................................................................................................... + // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................|..............................*.......................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.................................*....................................................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // ............................................................................................................................................................................|......................................*.................................................................................................................................. + // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................|................................*........................................................................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................|...............................*......................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|....................................*.................................................................................................................................... + // mls v19.4s, v28.4s, v29.4s // ............................................................................................................................................................................|..........................................*.............................................................................................................................. 
+ // cmge v27.4s, v31.4s, v20.4s // ............................................................................................................................................................................|..................................................................*...................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // ............................................................................................................................................................................|...................................................................*..................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|........................................................................*................................................................................................ + // mls v20.4s, v28.4s, v29.4s // ............................................................................................................................................................................|.................................................................................*....................................................................................... + // cmge v27.4s, v31.4s, v21.4s // ..........*.................................................................................................................................................................|.....................................................................................................................*................................................... 
+ // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................|.......................................................................................................*................................................................. + // sub v28.4s, v27.4s, v28.4s // ....................*.......................................................................................................................................................|...............................................................................................................................*......................................... + // mls v21.4s, v28.4s, v29.4s // ...........................*................................................................................................................................................|......................................................................................................................................*.................................. + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................|........................................*................................................................................................................................ + // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................|.........................................*............................................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|..............................................*.......................................................................................................................... + // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................................................|......................................................*.................................................................................................................. + // cmge v27.4s, v31.4s, v23.4s // ............................................................................................................................................................................|......................................................................*.................................................................................................. + // cmge v28.4s, v23.4s, v30.4s // ............................................................................................................................................................................|.......................................................................*................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|...........................................................................*............................................................................................. 
+ // mls v23.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...................................................................................*..................................................................................... + // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................|......................*.................................................................................................................................................. + // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................|......................................................................................*.................................................................................. + // str q18, [x1, #(10*(512/8))] // ............................................................................................................................................................................|.................................................*....................................................................................................................... + // str q19, [x1, #(11*(512/8))] // ............................................................................................................................................................................|...........................................................*............................................................................................................. 
+ // str q20, [x1, #(12*(512/8))] // ............................................................................................................................................................................|........................................................................................*................................................................................ + // str q21, [x1, #(13*(512/8))] // ...................................*........................................................................................................................................|..............................................................................................................................................*.......................... + // str q22, [x1, #(14*(512/8))] // ............................................................................................................................................................................|..........................................................*.............................................................................................................. + // str q23, [x1, #(15*(512/8))] // ............................................................................................................................................................................|...............................................................................................*......................................................................... + // mul v16.4s, v8.4s, v25.4s // ............................................................................................................................................................................|....................................................*.................................................................................................................... 
+ // sqrdmulh v8.4s, v8.4s, v26.4s // ............................................................................................................................................................................|.....................................................*................................................................................................................... + // mls v16.4s, v8.4s, v29.4s // ............................................................................................................................................................................|.......................................................*................................................................................................................. + // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................|..........................................................................................*.............................................................................. + // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................|............................................................................................*............................................................................ + // mls v17.4s, v9.4s, v29.4s // ............................................................................................................................................................................|.....................................................................................................*................................................................... 
+ // mul v18.4s, v10.4s, v25.4s // ............................................................................................................................................................................|.............................................................................*........................................................................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ............................................................................................................................................................................|..............................................................................*.......................................................................................... + // mls v18.4s, v10.4s, v29.4s // .*..........................................................................................................................................................................|............................................................................................................*............................................................ + // mul v19.4s, v11.4s, v25.4s // ............................................................................................................................................................................|............................................................*............................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................................................................................................................................................|........................................................*................................................................................................................ 
+ // mls v19.4s, v11.4s, v29.4s // ............................................................................................................................................................................|....................................................................*.................................................................................................... + // mul v20.4s, v12.4s, v25.4s // ............................................................................................................................................................................|.............................................................................................*........................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ............................................................................................................................................................................|.........................................................................................*............................................................................... + // mls v20.4s, v12.4s, v29.4s // ............................................................................................................................................................................|....................................................................................................*.................................................................... + // mul v21.4s, v13.4s, v25.4s // ............................................................................................................................................................................|...............................................*......................................................................................................................... 
+ // sqrdmulh v13.4s, v13.4s, v26.4s // ............................................................................................................................................................................|..................................................*...................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ............................................................................................................................................................................|..............................................................*.......................................................................................................... + // mul v22.4s, v14.4s, v25.4s // ............................................................................................................................................................................|................................................................*........................................................................................................ + // sqrdmulh v14.4s, v14.4s, v26.4s // ............................................................................................................................................................................|..........................................................................*.............................................................................................. + // mls v22.4s, v14.4s, v29.4s // ............................................................................................................................................................................|.......................................................................................*................................................................................. 
+ // mul v23.4s, v15.4s, v25.4s // ............................................................................................................................................................................|................................................................................................*........................................................................ + // sqrdmulh v15.4s, v15.4s, v26.4s // ............................................................................................................................................................................|.................................................................................................*....................................................................... + // mls v23.4s, v15.4s, v29.4s // ............................................................................................................................................................................|..........................................................................................................*.............................................................. + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................|.........................................................................*............................................................................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................|..................................................................................*...................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.....................................................................................*................................................................................... + // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................|...........................................................................................*............................................................................. + // cmge v27.4s, v31.4s, v17.4s // ...............*............................................................................................................................................................|..........................................................................................................................*.............................................. + // cmge v28.4s, v17.4s, v30.4s // .....................*......................................................................................................................................................|................................................................................................................................*........................................ + // sub v28.4s, v27.4s, v28.4s // .........................*..................................................................................................................................................|....................................................................................................................................*.................................... 
+ // mls v17.4s, v28.4s, v29.4s // ................................................*...........................................................................................................................|...........................................................................................................................................................*............. + // cmge v27.4s, v31.4s, v18.4s // ......................................*.....................................................................................................................................|.................................................................................................................................................*....................... + // cmge v28.4s, v18.4s, v30.4s // ...............................*............................................................................................................................................|..........................................................................................................................................*.............................. + // sub v28.4s, v27.4s, v28.4s // ...............................................*............................................................................................................................|..........................................................................................................................................................*.............. + // mls v18.4s, v28.4s, v29.4s // ........................................................*...................................................................................................................|...................................................................................................................................................................*..... 
+ // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................|................................................................................*........................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................|...............................................................................*......................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|....................................................................................*.................................................................................... + // mls v19.4s, v28.4s, v29.4s // ......*.....................................................................................................................................................................|.................................................................................................................*....................................................... + // cmge v27.4s, v31.4s, v20.4s // ........*...................................................................................................................................................................|...................................................................................................................*..................................................... 
+ // cmge v28.4s, v20.4s, v30.4s // .........*..................................................................................................................................................................|....................................................................................................................*.................................................... + // sub v28.4s, v27.4s, v28.4s // .................*..........................................................................................................................................................|............................................................................................................................*............................................ + // mls v20.4s, v28.4s, v29.4s // ..........................*.................................................................................................................................................|.....................................................................................................................................*................................... + // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................|..............................................................................................*.......................................................................... + // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................|..................................................................................................*...................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................|.........................................................................................................*............................................................... + // mls v21.4s, v28.4s, v29.4s // ...................*........................................................................................................................................................|..............................................................................................................................*.......................................... + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................|...................................................................................................*..................................................................... + // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................|........................................................................................................*................................................................ + // sub v28.4s, v27.4s, v28.4s // ..*.........................................................................................................................................................................|.............................................................................................................*........................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ...........*................................................................................................................................................................|......................................................................................................................*.................................................. + // cmge v27.4s, v31.4s, v23.4s // .....*......................................................................................................................................................................|................................................................................................................*........................................................ + // cmge v28.4s, v23.4s, v30.4s // .......*....................................................................................................................................................................|..................................................................................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ..............*.............................................................................................................................................................|.........................................................................................................................*............................................... + // mls v23.4s, v28.4s, v29.4s // ........................*...................................................................................................................................................|...................................................................................................................................*..................................... 
+ // str q16, [x1], #(16) // ............................................................................................................................................................................|......................................................................................................*.................................................................. + // str q17, [x1, #(-16 + 1*(512/8))] // ...........................................................*................................................................................................................|......................................................................................................................................................................*.. + // str q18, [x1, #(-16 + 2*(512/8))] // .............................................................*..............................................................................................................|........................................................................................................................................................................* + // str q19, [x1, #(-16 + 3*(512/8))] // ..................*.........................................................................................................................................................|.............................................................................................................................*........................................... + // str q20, [x1, #(-16 + 4*(512/8))] // ....................................*.......................................................................................................................................|...............................................................................................................................................*......................... 
+ // str q21, [x1, #(-16 + 5*(512/8))] // ..............................*.............................................................................................................................................|.........................................................................................................................................*............................... + // str q22, [x1, #(-16 + 6*(512/8))] // ............................*...............................................................................................................................................|.......................................................................................................................................*................................. + // str q23, [x1, #(-16 + 7*(512/8))] // .......................................*....................................................................................................................................|..................................................................................................................................................*...................... + + sub count, count, #1 + cbnz count, layer1234_start + mls v24.4S, v22.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mls v27.4S, v21.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... 
+ cmge v22.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v21.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + add v11.4S, v14.4S, v23.4S // ........................................................................................................................................................*............................................................................................................................... + sub v23.4S, v14.4S, v23.4S // .......................................................................................................................................................*................................................................................................................................ + add v14.4S, v17.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ + sub v17.4S, v17.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. 
+ mul v18.4S, v19.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + sub v21.4S, v21.4S, v22.4S // ..................................................................................................................................................................................*..................................................................................................... + sub v22.4S, v12.4S, v20.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v12.4S, v20.4S // .............................................................................................................................................................*.......................................................................................................................... + sqrdmulh v20.4S, v28.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. 
+ mul v28.4S, v28.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v16.4S, v21.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + sub v21.4S, v13.4S, v27.4S // .................................................................................................................................................................*...................................................................................................................... + add v13.4S, v13.4S, v27.4S // ..................................................................................................................................................................*..................................................................................................................... + mls v18.4S, v19.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v19.4S, v23.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sqrdmulh v23.4S, v23.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v28.4S, v20.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v20.4S, v22.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + sqrdmulh v27.4S, v22.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + mul v22.4S, v17.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... 
+ sqrdmulh v16.4S, v17.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v17.4S, v15.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ + mls v22.4S, v16.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v16.4S, v18.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + mls v20.4S, v27.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + cmge v27.4S, v31.4S, v18.4S // ........................................................................................................................................................................................*............................................................................................... 
+ mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + sqrdmulh v23.4S, v15.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + add v15.4S, v24.4S, v28.4S // ............................................................................................................................................................................*........................................................................................................... + sub v27.4S, v27.4S, v16.4S // ..........................................................................................................................................................................................*............................................................................................. + sub v16.4S, v24.4S, v28.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v28.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. 
+ cmge v24.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................*............................................................................... + mls v18.4S, v27.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + sqrdmulh v27.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mls v17.4S, v23.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sub v23.4S, v24.4S, v28.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... 
+ mul v24.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + cmge v21.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + mls v22.4S, v23.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v23.4S, v16.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v16.4S, v16.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + sub v28.4S, v21.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... 
+ mls v24.4S, v27.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v21.4S, v31.4S, v20.4S // ................................................................................................................................................................................................*....................................................................................... + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................*................................................................................................... + str q18, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + cmge v18.4S, v20.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ 
+ mls v23.4S, v16.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v18.4S, v21.4S, v18.4S // ..................................................................................................................................................................................................*..................................................................................... + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + cmge v19.4S, v24.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v28.4S, v31.4S, v24.4S // ....................................................................................................................................................................................................*................................................................................... + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ 
+ str q22, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + cmge v16.4S, v17.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + mul v22.4S, v14.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v14.4S, v14.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + mls v20.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ sub v18.4S, v28.4S, v19.4S // ......................................................................................................................................................................................................*................................................................................. + sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v27.4S, v27.4S, v16.4S // ......................................................................................................................................................................................*................................................................................................. + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + cmge v19.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v8.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... 
+ mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + mls v24.4S, v18.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + sub v8.4S, v19.4S, v8.4S // ..............................................................................................................................................................................................................*......................................................................... + str q20, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + mls v22.4S, v14.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ sqrdmulh v14.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v17.4S, v27.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v23.4S, v8.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + cmge v8.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... 
+ sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mul v18.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + str q24, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + sqrdmulh v12.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + mls v19.4S, v14.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + str q17, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... 
+ cmge v13.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v28.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + cmge v10.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v11.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v9.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... 
+ mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + sqrdmulh v27.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mul v23.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + mls v18.4S, v12.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + sub v10.4S, v10.4S, v13.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ cmge v24.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + mls v17.4S, v28.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + sub v13.4S, v9.4S, v11.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v15.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + sub v11.4S, v8.4S, v24.4S // ..................................................................................................................................................................................................................................................*..................................... + mls v23.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v27.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + cmge v9.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + mls v22.4S, v10.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + mls v16.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + sub v10.4S, v9.4S, v27.4S // ..................................................................................................................................................................................................................................................................*..................... 
+ cmge v27.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + cmge v13.4S, v18.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + cmge v24.4S, v31.4S, v18.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v8.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + mls v20.4S, v10.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + cmge v10.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. 
+ cmge v11.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v12.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................................................................................*........... + sub v27.4S, v15.4S, v27.4S // ..............................................................................................................................................................................................................................................................*......................... + str q22, [x1, #384] // ......................................................................................................................................................................................................................................................................................*. + str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + sub v22.4S, v24.4S, v13.4S // ..........................................................................................................................................................................................................................................................*............................. 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v10.4S, v8.4S, v10.4S // ......................................................................................................................................................................................................................................................*................................. + sub v13.4S, v12.4S, v11.4S // ..............................................................................................................................................................................................................................................................................*......... + mls v19.4S, v27.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + mls v18.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... 
+ mls v23.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + mls v17.4S, v10.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q18, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + str q23, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + str q17, [x1, #48] // .................................................................................................................................................................................................................................................................................*...... 
+ + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s new file mode 100644 index 00000000..d449be9f --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_1234_5678_opt_m1_icestorm.s @@ -0,0 +1,1764 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. 
+ +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vsub d,a,b + sub \d\().4s, \a\().4s, \b\().4s +.endm +.macro vadd d,a,b + add \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmul d,a,b + mul \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro trn1_d d,a,b + trn1 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn2_d d,a,b + trn2 \d\().2d, \a\().2d, \b\().2d +.endm +.macro trn1_s d,a,b + trn1 \d\().4s, \a\().4s, \b\().4s +.endm +.macro trn2_s d,a,b + trn2 \d\().4s, \a\().4s, \b\().4s +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmls \dst, \src, modulus +.endm + +.macro mulmod dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, modulus +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, modulus +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + vsub tmp, \a, \b + vadd \a, \a, \b + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v 
dst, src, const, const_twisted + vmul \dst, \src, \const + vqrdmulh \src, \src, \const_twisted + vmls \dst, \src, modulus +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + vsub tmp, \a, \b + vadd \a, \a, \b + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src0, src1, src2, src3, src4, src5, src6, src7 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw + mulmod \dst4, \src4, ninv, ninv_tw + mulmod \dst5, \src5, ninv, ninv_tw + mulmod \dst6, \src6, ninv, ninv_tw + mulmod \dst7, \src7, ninv, ninv_tw +.endm + +.macro load_roots_1234 r_ptr + ldr_vi root0, \r_ptr, (8*16) + ldr_vo root1, \r_ptr, (-8*16 + 1*16) + ldr_vo root2, \r_ptr, (-8*16 + 2*16) + ldr_vo root3, \r_ptr, (-8*16 + 3*16) + ldr_vo root4, \r_ptr, (-8*16 + 4*16) + ldr_vo root5, \r_ptr, (-8*16 + 5*16) + ldr_vo root6, \r_ptr, (-8*16 + 6*16) + ldr_vo root7, \r_ptr, (-8*16 + 7*16) +.endm + +.macro load_next_roots_56 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 16 +.endm + +.macro load_next_roots_6 root0, r_ptr0 + ldr_vi \root0, \r_ptr0, 8 +.endm + +.macro load_next_roots_78 root0, root0_tw, root1, root1_tw, root2, root2_tw, r_ptr1 + ldr_vi \root0, \r_ptr1, (6*16) + ldr_vo \root0_tw, \r_ptr1, (-6*16 + 1*16) + ldr_vo \root1, \r_ptr1, (-6*16 + 2*16) + ldr_vo \root1_tw, \r_ptr1, (-6*16 + 3*16) + ldr_vo \root2, \r_ptr1, (-6*16 + 4*16) + ldr_vo \root2_tw, \r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1_s t0, \data\()0, \data\()1 + trn2_s t1, \data\()0, \data\()1 + trn1_s t2, \data\()2, \data\()3 + trn2_s t3, \data\()2, \data\()3 + + trn2_d \data\()2, t0, t2 + trn2_d \data\()3, t1, t3 + trn1_d \data\()0, t0, t2 + trn1_d \data\()1, t1, t3 +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] 
+ stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_1234_5678_twiddles.s" +.text + + .global intt_dilithium_1234_5678_opt_m1_icestorm + .global _intt_dilithium_1234_5678_opt_m1_icestorm + +.p2align 4 +modulus_addr: .quad 8380417 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_1234_5678_opt_m1_icestorm: +_intt_dilithium_1234_5678_opt_m1_icestorm: + push_stack + + inp .req x0 + in .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + data8 .req v16 + data9 .req v17 + data10 .req v18 + data11 .req v19 + data12 .req v20 + data13 .req v21 + data14 .req v22 + data15 .req v23 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + 
qform_data8 .req q16 + qform_data9 .req q17 + qform_data10 .req q18 + qform_data11 .req q19 + qform_data12 .req q20 + qform_data13 .req q21 + qform_data14 .req q22 + qform_data15 .req q23 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + + tmp .req v24 + qform_tmp .req q24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots) + ASM_LOAD(r_ptr1, roots_l45) + + ASM_LOAD(xtmp, modulus_addr) + ld1r {modulus.4s}, [xtmp] + + save STACK0, inp + + mov count, #16 + + .p2align 2 + ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x0] // *........................................ + ldr q5, [x3, #32] // ..*...................................... + ldr q8, [x3], #(6*16) // .*....................................... + // gap // ......................................... + ldr q9, [x3, #-48] // ....*.................................... + ldr q0, [x3, #-80] // ...*..................................... + // gap // ......................................... + // gap // ......................................... + ldr q2, [x3, #-32] // .....*................................... + ldr q12, [x3, #-16] // ......*.................................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v15.4S, v19.4S, v20.4S // .......*................................. + add v19.4S, v19.4S, v20.4S // ........*................................ + // gap // ......................................... + // gap // ......................................... + sub v13.4S, v21.4S, v22.4S // .........*............................... + add v18.4S, v21.4S, v22.4S // ..............*.......................... + // gap // ......................................... + // gap // ......................................... + mul v5.4S, v15.4S, v5.4S // ..........*.............................. + sqrdmulh v9.4S, v15.4S, v9.4S // ...........*............................. + // gap // ......................................... + // gap // ......................................... + mul v2.4S, v13.4S, v2.4S // ............*............................ + sqrdmulh v12.4S, v13.4S, v12.4S // .............*........................... + // gap // ......................................... + // gap // ......................................... + sub v15.4S, v19.4S, v18.4S // .................*....................... + add v19.4S, v19.4S, v18.4S // ..................*...................... + // gap // ......................................... + // gap // ......................................... + mls v5.4S, v9.4S, v29.4S // ...............*......................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v2.4S, v12.4S, v29.4S // ................*........................ + mul v9.4S, v15.4S, v8.4S // .....................*................... + // gap // ......................................... 
+ // gap // ......................................... + sqrdmulh v12.4S, v15.4S, v0.4S // ......................*.................. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + sub v15.4S, v5.4S, v2.4S // ...................*..................... + add v5.4S, v5.4S, v2.4S // ....................*.................... + // gap // ......................................... + // gap // ......................................... + mls v9.4S, v12.4S, v29.4S // ...........................*............. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mul v8.4S, v15.4S, v8.4S // .......................*................. + sqrdmulh v0.4S, v15.4S, v0.4S // ........................*................ + // gap // ......................................... + // gap // ......................................... + trn1 v2.4S, v19.4S, v5.4S // .........................*............... + trn2 v19.4S, v19.4S, v5.4S // ............................*............ + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v8.4S, v0.4S, v29.4S // ..........................*.............. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn1 v5.4S, v9.4S, v8.4S // .............................*........... + trn2 v8.4S, v9.4S, v8.4S // ..............................*.......... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + trn2 v7.2D, v2.2D, v5.2D // ...............................*......... + trn2 v12.2D, v19.2D, v8.2D // ................................*........ + // gap // ......................................... + // gap // ......................................... + trn1 v2.2D, v2.2D, v5.2D // .................................*....... + trn1 v0.2D, v19.2D, v8.2D // ..................................*...... + // gap // ......................................... + // gap // ......................................... + add v8.4S, v7.4S, v12.4S // ...................................*..... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + add v9.4S, v2.4S, v0.4S // ....................................*.... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ // gap // ......................................... + // gap // ......................................... + add v19.4S, v9.4S, v8.4S // .....................................*... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + srshr v5.4S, v19.4S, #23 // ......................................*.. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + mls v19.4S, v5.4S, v29.4S // .......................................*. + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... 
+ str q19, [x0], #(16*4) // ........................................* + // gap // ......................................... + // gap // ......................................... + // gap // ......................................... + + // original source code + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x0] // *........................................ + // ldr q26, [x3], #(6*16) // ..*...................................... + // ldr q27, [x3, #-64] // .*....................................... + // ldr q3, [x3, #-80] // ....*.................................... + // ldr q17, [x3, #-48] // ...*..................................... + // ldr q18, [x3, #-32] // .....*................................... + // ldr q20, [x3, #-16] // ......*.................................. + // sub v22.4S, v13.4S, v14.4S // .......*................................. + // add v5.4S, v13.4S, v14.4S // ........*................................ + // sub v9.4S, v15.4S, v16.4S // .........*............................... + // mul v11.4S, v22.4S, v27.4S // ...........*............................. + // sqrdmulh v24.4S, v22.4S, v17.4S // ............*............................ + // mul v28.4S, v9.4S, v18.4S // .............*........................... + // sqrdmulh v21.4S, v9.4S, v20.4S // ..............*.......................... + // add v14.4S, v15.4S, v16.4S // ..........*.............................. + // mls v11.4S, v24.4S, v29.4S // .................*....................... + // mls v28.4S, v21.4S, v29.4S // ..................*...................... + // sub v6.4S, v5.4S, v14.4S // ...............*......................... + // add v14.4S, v5.4S, v14.4S // ................*........................ + // sub v7.4S, v11.4S, v28.4S // .....................*................... + // add v13.4S, v11.4S, v28.4S // ......................*.................. + // mul v0.4S, v6.4S, v26.4S // ...................*..................... 
+ // sqrdmulh v23.4S, v6.4S, v3.4S // ....................*.................... + // mul v18.4S, v7.4S, v26.4S // ........................*................ + // sqrdmulh v20.4S, v7.4S, v3.4S // .........................*............... + // trn1 v26.4S, v14.4S, v13.4S // ..........................*.............. + // mls v18.4S, v20.4S, v29.4S // ............................*............ + // mls v0.4S, v23.4S, v29.4S // .......................*................. + // trn2 v25.4S, v14.4S, v13.4S // ...........................*............. + // trn1 v6.4S, v0.4S, v18.4S // .............................*........... + // trn2 v27.4S, v0.4S, v18.4S // ..............................*.......... + // trn2 v7.2D, v26.2D, v6.2D // ...............................*......... + // trn2 v12.2D, v25.2D, v27.2D // ................................*........ + // trn1 v2.2D, v26.2D, v6.2D // .................................*....... + // trn1 v0.2D, v25.2D, v27.2D // ..................................*...... + // add v8.4S, v7.4S, v12.4S // ...................................*..... + // add v9.4S, v2.4S, v0.4S // ....................................*.... + // add v22.4S, v9.4S, v8.4S // .....................................*... + // srshr v13.4S, v22.4S, #23 // ......................................*.. + // mls v22.4S, v13.4S, v29.4S // .......................................*. + // str q22, [x0], #(16*4) // ........................................* + + sub count, count, #1 +layer5678_start: + sub v19.4S, v9.4S, v8.4S // ...............................................*................. + ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x0] // e................................................................ + ldr q26, [x3], #(6*16) // .e............................................................... + ldr q27, [x3, #-64] // ...e............................................................. + ldr q3, [x3, #-80] // ..e.............................................................. 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q17, [x3, #-48] // ....e............................................................ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q18, [x3, #-32] // .....e........................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + ldr q20, [x3, #-16] // ......e.......................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v30.4S, v2.4S, v0.4S // .....................................*........................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v22.4S, v13.4S, v14.4S // .......e......................................................... + add v5.4S, v13.4S, v14.4S // ........e........................................................ + // gap // ................................................................. + // gap // ................................................................. + sub v9.4S, v15.4S, v16.4S // ............e.................................................... 
+ // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mul v11.4S, v22.4S, v27.4S // .........e....................................................... + sqrdmulh v24.4S, v22.4S, v17.4S // ..........e...................................................... + // gap // ................................................................. + // gap // ................................................................. + mul v28.4S, v9.4S, v18.4S // ..............e.................................................. + sqrdmulh v21.4S, v9.4S, v20.4S // ...............e................................................. + // gap // ................................................................. + // gap // ................................................................. + add v14.4S, v15.4S, v16.4S // .............e................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v11.4S, v24.4S, v29.4S // ...........e..................................................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v28.4S, v21.4S, v29.4S // ................e................................................ + sub v6.4S, v5.4S, v14.4S // .................e............................................... + // gap // ................................................................. + // gap // ................................................................. 
+ add v14.4S, v5.4S, v14.4S // ..................e.............................................. + ldr q5, [x4], #8 // ...................................*............................. + ldr q10, [x4], #16 // ....................................*............................ + // gap // ................................................................. + sub v1.4S, v7.4S, v12.4S // ..........................................*...................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + sub v7.4S, v11.4S, v28.4S // ......................e.......................................... + add v13.4S, v11.4S, v28.4S // .......................e......................................... + // gap // ................................................................. + // gap // ................................................................. + mul v0.4S, v6.4S, v26.4S // ...................e............................................. + sqrdmulh v23.4S, v6.4S, v3.4S // ....................e............................................ + // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v7.4S, v26.4S // ........................e........................................ + sqrdmulh v20.4S, v7.4S, v3.4S // .........................e....................................... + // gap // ................................................................. + // gap // ................................................................. + sqrdmulh v24.4S, v30.4S, v10.S[1] // ........................................*........................ + trn1 v26.4S, v14.4S, v13.4S // ...........................e..................................... 
+ // gap // ................................................................. + // gap // ................................................................. + mul v30.4S, v30.4S, v10.S[0] // .......................................*......................... + mul v31.4S, v1.4S, v10.S[2] // ............................................*.................... + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v20.4S, v29.4S // ..........................e...................................... + mls v0.4S, v23.4S, v29.4S // .....................e........................................... + // gap // ................................................................. + // gap // ................................................................. + trn2 v25.4S, v14.4S, v13.4S // ............................e.................................... + sqrdmulh v16.4S, v1.4S, v10.S[3] // .............................................*................... + // gap // ................................................................. + // gap // ................................................................. + mls v30.4S, v24.4S, v29.4S // .........................................*....................... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn1 v6.4S, v0.4S, v18.4S // .............................e................................... + trn2 v27.4S, v0.4S, v18.4S // ..............................e.................................. + // gap // ................................................................. + // gap // ................................................................. 
+ mls v31.4S, v16.4S, v29.4S // ..............................................*.................. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + trn2 v7.2D, v26.2D, v6.2D // ...............................e................................. + trn2 v12.2D, v25.2D, v27.2D // ................................e................................ + // gap // ................................................................. + // gap // ................................................................. + trn1 v2.2D, v26.2D, v6.2D // .................................e............................... + trn1 v0.2D, v25.2D, v27.2D // ..................................e.............................. + // gap // ................................................................. + // gap // ................................................................. + sub v20.4S, v30.4S, v31.4S // ....................................................*............ + add v8.4S, v7.4S, v12.4S // ...........................................e..................... + // gap // ................................................................. + // gap // ................................................................. + add v14.4S, v30.4S, v31.4S // .....................................................*........... + add v9.4S, v2.4S, v0.4S // ......................................e.......................... + // gap // ................................................................. + // gap // ................................................................. + mul v18.4S, v19.4S, v5.S[0] // .................................................*............... + sqrdmulh v19.4S, v19.4S, v5.S[1] // ..................................................*.............. 
+ // gap // ................................................................. + // gap // ................................................................. + srshr v30.4S, v14.4S, #23 // ...........................................................*..... + add v22.4S, v9.4S, v8.4S // ................................................e................ + // gap // ................................................................. + // gap // ................................................................. + mul v3.4S, v20.4S, v5.S[0] // ......................................................*.......... + sqrdmulh v26.4S, v20.4S, v5.S[1] // .......................................................*......... + // gap // ................................................................. + // gap // ................................................................. + mls v18.4S, v19.4S, v29.4S // ...................................................*............. + srshr v13.4S, v22.4S, #23 // .........................................................e....... + // gap // ................................................................. + // gap // ................................................................. + mls v14.4S, v30.4S, v29.4S // ............................................................*.... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v3.4S, v26.4S, v29.4S // ........................................................*........ + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + mls v22.4S, v13.4S, v29.4S // ..........................................................e...... 
+ str q18, [x0, #-32] // ...............................................................*. + // gap // ................................................................. + // gap // ................................................................. + str q14, [x0, #-48] // ..............................................................*.. + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q3, [x0, #-16] // ................................................................* + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + str q22, [x0], #(16*4) // .............................................................e... + // gap // ................................................................. + // gap // ................................................................. + // gap // ................................................................. + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0] // e...............................................................|e.............................................................. + // ldr q0, [x3], #(6*16) // .e..............................................................|.e............................................................. + // ldr q4, [x3, #(-6*16 + 1*16)] // ...e............................................................|...e........................................................... + // ldr q1, [x3, #(-6*16 + 2*16)] // ..e.............................................................|..e............................................................ 
+ // ldr q5, [x3, #(-6*16 + 3*16)] // ....e...........................................................|....e.......................................................... + // ldr q2, [x3, #(-6*16 + 4*16)] // .....e..........................................................|.....e......................................................... + // ldr q6, [x3, #(-6*16 + 5*16)] // ......e.........................................................|......e........................................................ + // sub v24.4s, v8.4s, v9.4s // ........e.......................................................|........e...................................................... + // add v8.4s, v8.4s, v9.4s // .........e......................................................|.........e..................................................... + // mul v9.4s, v24.4s, v1.4s // ...........e....................................................|...........e................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ............e...................................................|............e.................................................. + // mls v9.4s, v24.4s, v29.4s // ................e...............................................|................e.............................................. + // sub v24.4s, v10.4s, v11.4s // ..........e.....................................................|..........e.................................................... + // add v10.4s, v10.4s, v11.4s // ...............e................................................|...............e............................................... + // mul v11.4s, v24.4s, v2.4s // .............e..................................................|.............e................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............e.................................................|..............e................................................ 
+ // mls v11.4s, v24.4s, v29.4s // .................e..............................................|.................e............................................. + // sub v24.4s, v8.4s, v10.4s // ..................e.............................................|..................e............................................ + // add v8.4s, v8.4s, v10.4s // ...................e............................................|...................e........................................... + // mul v10.4s, v24.4s, v0.4s // .........................e......................................|.........................e..................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................e.....................................|..........................e.................................... + // mls v10.4s, v24.4s, v29.4s // ..................................e.............................|..................................e............................ + // sub v24.4s, v9.4s, v11.4s // .......................e........................................|.......................e....................................... + // add v9.4s, v9.4s, v11.4s // ........................e.......................................|........................e...................................... + // mul v11.4s, v24.4s, v0.4s // ...........................e....................................|...........................e................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ............................e...................................|............................e.................................. + // mls v11.4s, v24.4s, v29.4s // .................................e..............................|.................................e............................. + // trn1 v25.4s, v8.4s, v9.4s // ..............................e.................................|..............................e................................ 
+ // trn2 v26.4s, v8.4s, v9.4s // ...................................e............................|...................................e........................... + // trn1 v27.4s, v10.4s, v11.4s // ......................................e.........................|......................................e........................ + // trn2 v28.4s, v10.4s, v11.4s // .......................................e........................|.......................................e....................... + // trn2 v10.2d, v25.2d, v27.2d // .........................................e......................|.........................................e..................... + // trn2 v11.2d, v26.2d, v28.2d // ..........................................e.....................|..........................................e.................... + // trn1 v8.2d, v25.2d, v27.2d // ...........................................e....................|...........................................e................... + // trn1 v9.2d, v26.2d, v28.2d // ............................................e...................|............................................e.................. + // ldr q1, [x4], #8 // ....................*...........................................|....................*.......................................... + // ldr q0, [x4], #16 // .....................*..........................................|.....................*......................................... + // sub v24.4s, v8.4s, v9.4s // .......*........................................................|.......*....................................................... + // add v8.4s, v8.4s, v9.4s // ................................................e...............|................................................e.............. + // mul v9.4s, v24.4s, v0.s[0] // ...............................*................................|...............................*............................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*..................................|.............................*................................. + // mls v9.4s, v24.4s, v29.4s // .....................................*..........................|.....................................*......................... + // sub v24.4s, v10.4s, v11.4s // ......................*.........................................|......................*........................................ + // add v10.4s, v10.4s, v11.4s // ..............................................e.................|..............................................e................ + // mul v11.4s, v24.4s, v0.s[2] // ................................*...............................|................................*.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ....................................*...........................|....................................*.......................... + // mls v11.4s, v24.4s, v29.4s // ........................................*.......................|........................................*...................... + // sub v24.4s, v8.4s, v10.4s // ................................................................*............................................................... + // add v8.4s, v8.4s, v10.4s // ....................................................e...........|....................................................e.......... + // mul v10.4s, v24.4s, v1.s[0] // .................................................*..............|.................................................*............. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................*.............|..................................................*............ + // mls v10.4s, v24.4s, v29.4s // .......................................................*........|.......................................................*....... 
+ // sub v24.4s, v9.4s, v11.4s // .............................................*..................|.............................................*................. + // add v9.4s, v9.4s, v11.4s // ...............................................*................|...............................................*............... + // mul v11.4s, v24.4s, v1.s[0] // .....................................................*..........|.....................................................*......... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................*.........|......................................................*........ + // mls v11.4s, v24.4s, v29.4s // ..........................................................*.....|..........................................................*.... + // srshr v24.4S, v8.4S, #23 // ........................................................e.......|........................................................e...... + // mls v8.4s, v24.4s, v29.4s // ...........................................................e....|...........................................................e... + // srshr v24.4S, v9.4S, #23 // ...................................................*............|...................................................*........... + // mls v9.4s, v24.4s, v29.4s // .........................................................*......|.........................................................*..... + // str q8, [x0], #(16*4) // ...............................................................e|............................................................... + // str q9, [x0, #(-16*4 + 1*16)] // .............................................................*..|.............................................................*. + // str q10, [x0, #(-16*4 + 2*16)] // ............................................................*...|............................................................*.. 
+ // str q11, [x0, #(-16*4 + 3*16)] // ..............................................................*.|..............................................................* + + sub count, count, #1 + cbnz count, layer5678_start + sub v13.4S, v9.4S, v8.4S // *....................... + sub v8.4S, v2.4S, v0.4S // .*...................... + ldr q2, [x4], #8 // ..*..................... + ldr q5, [x4], #16 // ...*.................... + sub v19.4S, v7.4S, v12.4S // ....*................... + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + sqrdmulh v0.4S, v8.4S, v5.S[1] // .....*.................. + mul v9.4S, v8.4S, v5.S[0] // ......*................. + // gap // ........................ + // gap // ........................ + mul v8.4S, v19.4S, v5.S[2] // .......*................ + sqrdmulh v5.4S, v19.4S, v5.S[3] // ........*............... + // gap // ........................ + // gap // ........................ + mul v16.4S, v13.4S, v2.S[0] // .............*.......... + // gap // ........................ + // gap // ........................ + // gap // ........................ + mls v9.4S, v0.4S, v29.4S // .........*.............. + sqrdmulh v19.4S, v13.4S, v2.S[1] // ..............*......... + // gap // ........................ + // gap // ........................ + mls v8.4S, v5.4S, v29.4S // ..........*............. + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ 
+ mls v16.4S, v19.4S, v29.4S // ..................*..... + // gap // ........................ + // gap // ........................ + // gap // ........................ + sub v5.4S, v9.4S, v8.4S // ...........*............ + // gap // ........................ + // gap // ........................ + // gap // ........................ + add v9.4S, v9.4S, v8.4S // ............*........... + // gap // ........................ + // gap // ........................ + // gap // ........................ + sqrdmulh v19.4S, v5.4S, v2.S[1] // .................*...... + mul v5.4S, v5.4S, v2.S[0] // ................*....... + str q16, [x0, #-32] // .....................*.. + // gap // ........................ + srshr v8.4S, v9.4S, #23 // ...............*........ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + mls v5.4S, v19.4S, v29.4S // ....................*... + // gap // ........................ + // gap // ........................ + // gap // ........................ + mls v9.4S, v8.4S, v29.4S // ...................*.... + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + // gap // ........................ + str q5, [x0, #-16] // .......................* + // gap // ........................ + // gap // ........................ + // gap // ........................ + str q9, [x0, #-48] // ......................*. + // gap // ........................ + // gap // ........................ + // gap // ........................ + + // original source code + // sub v19.4S, v9.4S, v8.4S // *....................... + // sub v30.4S, v2.4S, v0.4S // .*...................... 
+ // ldr q5, [x4], #8 // ..*..................... + // ldr q10, [x4], #16 // ...*.................... + // sub v1.4S, v7.4S, v12.4S // ....*................... + // sqrdmulh v24.4S, v30.4S, v10.S[1] // .....*.................. + // mul v30.4S, v30.4S, v10.S[0] // ......*................. + // mul v31.4S, v1.4S, v10.S[2] // .......*................ + // sqrdmulh v16.4S, v1.4S, v10.S[3] // ........*............... + // mls v30.4S, v24.4S, v29.4S // ..........*............. + // mls v31.4S, v16.4S, v29.4S // ............*........... + // sub v20.4S, v30.4S, v31.4S // ..............*......... + // add v14.4S, v30.4S, v31.4S // ...............*........ + // mul v18.4S, v19.4S, v5.S[0] // .........*.............. + // sqrdmulh v19.4S, v19.4S, v5.S[1] // ...........*............ + // srshr v30.4S, v14.4S, #23 // ...................*.... + // mul v3.4S, v20.4S, v5.S[0] // .................*...... + // sqrdmulh v26.4S, v20.4S, v5.S[1] // ................*....... + // mls v18.4S, v19.4S, v29.4S // .............*.......... + // mls v14.4S, v30.4S, v29.4S // .....................*.. + // mls v3.4S, v26.4S, v29.4S // ....................*... + // str q18, [x0, #-32] // ..................*..... + // str q14, [x0, #-48] // .......................* + // str q3, [x0, #-16] // ......................*. 
+ + + .unreq root0_tw + .unreq root1_tw + .unreq root2_tw + .unreq root3_tw + .unreq qform_root0_tw + .unreq qform_root1_tw + .unreq qform_root2_tw + .unreq qform_root3_tw + .unreq t0 + .unreq t1 + + root4 .req v4 + root5 .req v5 + root6 .req v6 + root7 .req v7 + qform_root4 .req q4 + qform_root5 .req q5 + qform_root6 .req q6 + qform_root7 .req q7 + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + + restore in, STACK0 + mov count, #4 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, modulus.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + load_roots_1234 r_ptr1 + + .p2align 2 + ldr q14, [x1, #320] // .....*.................................................................................................................................................................................................................................................................................. + ldr q28, [x1, #256] // ....*................................................................................................................................................................................................................................................................................... + ldr q22, [x1, #512] // ........*............................................................................................................................................................................................................................................................................... + ldr q19, [x1, #576] // .........*.............................................................................................................................................................................................................................................................................. 
+ ldr q27, [x1, #384] // ......*................................................................................................................................................................................................................................................................................. + ldr q12, [x1, #960] // ...............*........................................................................................................................................................................................................................................................................ + add v17.4S, v28.4S, v14.4S // ...........................*............................................................................................................................................................................................................................................................ + sub v9.4S, v28.4S, v14.4S // ..........................*............................................................................................................................................................................................................................................................. + ldr q23, [x1, #896] // ..............*......................................................................................................................................................................................................................................................................... + sub v8.4S, v22.4S, v19.4S // ....................................*................................................................................................................................................................................................................................................... 
+ ldr q15, [x1, #64] // .*...................................................................................................................................................................................................................................................................................... + add v21.4S, v22.4S, v19.4S // .....................................*.................................................................................................................................................................................................................................................. + ldr q20, [x1, #0] // *....................................................................................................................................................................................................................................................................................... + mul v24.4S, v9.4S, v4.S[2] // ............................*........................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v9.4S, v4.S[3] // .............................*.......................................................................................................................................................................................................................................................... + mul v9.4S, v8.4S, v5.S[2] // ......................................*................................................................................................................................................................................................................................................. 
+ sqrdmulh v28.4S, v8.4S, v5.S[3] // .......................................*................................................................................................................................................................................................................................................ + ldr q10, [x1, #192] // ...*.................................................................................................................................................................................................................................................................................... + sub v16.4S, v23.4S, v12.4S // ...................................................*.................................................................................................................................................................................................................................... + add v22.4S, v23.4S, v12.4S // ....................................................*................................................................................................................................................................................................................................... + ldr q19, [x1, #128] // ..*..................................................................................................................................................................................................................................................................................... + mls v24.4S, v14.4S, v29.4S // ..............................*......................................................................................................................................................................................................................................................... 
+ ldr q14, [x1, #448] // .......*................................................................................................................................................................................................................................................................................ + sub v11.4S, v20.4S, v15.4S // ................*....................................................................................................................................................................................................................................................................... + sqrdmulh v8.4S, v16.4S, v7.S[1] // ......................................................*................................................................................................................................................................................................................................. + mul v23.4S, v16.4S, v7.S[0] // .....................................................*.................................................................................................................................................................................................................................. + add v15.4S, v20.4S, v15.4S // .................*...................................................................................................................................................................................................................................................................... + ldr q12, [x1, #704] // ...........*............................................................................................................................................................................................................................................................................ 
+ add v13.4S, v27.4S, v14.4S // ................................*....................................................................................................................................................................................................................................................... + sub v14.4S, v27.4S, v14.4S // ...............................*........................................................................................................................................................................................................................................................ + sqrdmulh v20.4S, v11.4S, v3.S[3] // ...................*.................................................................................................................................................................................................................................................................... + mul v11.4S, v11.4S, v3.S[2] // ..................*..................................................................................................................................................................................................................................................................... + mul v16.4S, v14.4S, v5.S[0] // .................................*...................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v14.4S, v5.S[1] // ..................................*..................................................................................................................................................................................................................................................... 
+ sub v18.4S, v17.4S, v13.4S // ..................................................................*..................................................................................................................................................................................................................... + add v17.4S, v17.4S, v13.4S // ...................................................................*.................................................................................................................................................................................................................... + mls v11.4S, v20.4S, v29.4S // ....................*................................................................................................................................................................................................................................................................... + add v13.4S, v19.4S, v10.4S // ......................*................................................................................................................................................................................................................................................................. + mls v16.4S, v14.4S, v29.4S // ...................................*.................................................................................................................................................................................................................................................... + sqrdmulh v14.4S, v18.4S, v2.S[1] // .....................................................................*.................................................................................................................................................................................................................. 
+ sub v20.4S, v15.4S, v13.4S // ........................................................*............................................................................................................................................................................................................................... + mul v18.4S, v18.4S, v2.S[0] // ....................................................................*................................................................................................................................................................................................................... + mls v23.4S, v8.4S, v29.4S // .......................................................*................................................................................................................................................................................................................................ + add v27.4S, v15.4S, v13.4S // .........................................................*.............................................................................................................................................................................................................................. + ldr q13, [x1, #640] // ..........*............................................................................................................................................................................................................................................................................. + mul v8.4S, v20.4S, v1.S[2] // ..........................................................*............................................................................................................................................................................................................................. 
+ sqrdmulh v20.4S, v20.4S, v1.S[3] // ...........................................................*............................................................................................................................................................................................................................ + mls v18.4S, v14.4S, v29.4S // ......................................................................*................................................................................................................................................................................................................. + add v15.4S, v24.4S, v16.4S // ........................................................................*............................................................................................................................................................................................................... + mls v9.4S, v28.4S, v29.4S // ........................................*............................................................................................................................................................................................................................................... + sub v19.4S, v19.4S, v10.4S // .....................*.................................................................................................................................................................................................................................................................. + mls v8.4S, v20.4S, v29.4S // ............................................................*........................................................................................................................................................................................................................... 
+ sub v10.4S, v13.4S, v12.4S // .........................................*.............................................................................................................................................................................................................................................. + mul v28.4S, v19.4S, v4.S[0] // .......................*................................................................................................................................................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v4.S[1] // ........................*............................................................................................................................................................................................................................................................... + add v12.4S, v13.4S, v12.4S // ..........................................*............................................................................................................................................................................................................................................. + sqrdmulh v13.4S, v10.4S, v6.S[1] // ............................................*........................................................................................................................................................................................................................................... + sub v20.4S, v8.4S, v18.4S // ..........................................................................................................*............................................................................................................................................................................. 
+ mls v28.4S, v19.4S, v29.4S // .........................*.............................................................................................................................................................................................................................................................. + sub v24.4S, v24.4S, v16.4S // .......................................................................*................................................................................................................................................................................................................ + mul v14.4S, v20.4S, v0.S[2] // ............................................................................................................*........................................................................................................................................................................... + mul v19.4S, v10.4S, v6.S[0] // ...........................................*............................................................................................................................................................................................................................................ + sqrdmulh v16.4S, v20.4S, v0.S[3] // .............................................................................................................*.......................................................................................................................................................................... + ldr q20, [x1, #832] // .............*.......................................................................................................................................................................................................................................................................... 
+ add v10.4S, v8.4S, v18.4S // ...........................................................................................................*............................................................................................................................................................................ + sub v18.4S, v21.4S, v12.4S // ............................................................................*........................................................................................................................................................................................................... + sub v8.4S, v11.4S, v28.4S // .............................................................*.......................................................................................................................................................................................................................... + add v28.4S, v11.4S, v28.4S // ..............................................................*......................................................................................................................................................................................................................... + mls v19.4S, v13.4S, v29.4S // .............................................*.......................................................................................................................................................................................................................................... + ldr q13, [x1, #768] // ............*........................................................................................................................................................................................................................................................................... 
+ mls v14.4S, v16.4S, v29.4S // ..............................................................................................................*......................................................................................................................................................................... + add v16.4S, v21.4S, v12.4S // .............................................................................*.......................................................................................................................................................................................................... + mul v11.4S, v8.4S, v1.S[2] // ...............................................................*........................................................................................................................................................................................................................ + sqrdmulh v8.4S, v8.4S, v1.S[3] // ................................................................*....................................................................................................................................................................................................................... + sqrdmulh v21.4S, v18.4S, v2.S[3] // ...............................................................................*........................................................................................................................................................................................................ + mul v18.4S, v18.4S, v2.S[2] // ..............................................................................*......................................................................................................................................................................................................... 
+ add v12.4S, v13.4S, v20.4S // ...............................................*........................................................................................................................................................................................................................................ + sub v20.4S, v13.4S, v20.4S // ..............................................*......................................................................................................................................................................................................................................... + mls v11.4S, v8.4S, v29.4S // .................................................................*...................................................................................................................................................................................................................... + add v8.4S, v27.4S, v17.4S // .................................................................................................*...................................................................................................................................................................................... + mls v18.4S, v21.4S, v29.4S // ................................................................................*....................................................................................................................................................................................................... + sub v21.4S, v12.4S, v22.4S // ......................................................................................*................................................................................................................................................................................................. 
+ sub v17.4S, v27.4S, v17.4S // ................................................................................................*....................................................................................................................................................................................... + add v12.4S, v12.4S, v22.4S // .......................................................................................*................................................................................................................................................................................................ + sqrdmulh v22.4S, v24.4S, v2.S[1] // ..........................................................................*............................................................................................................................................................................................................. + mul v13.4S, v24.4S, v2.S[0] // .........................................................................*.............................................................................................................................................................................................................. + sub v24.4S, v28.4S, v15.4S // .....................................................................................................*.................................................................................................................................................................................. + add v27.4S, v16.4S, v12.4S // .....................................................................................................................*.................................................................................................................................................................. 
+ add v15.4S, v28.4S, v15.4S // ......................................................................................................*................................................................................................................................................................................. + sub v28.4S, v9.4S, v19.4S // .................................................................................*...................................................................................................................................................................................................... + add v9.4S, v9.4S, v19.4S // ..................................................................................*..................................................................................................................................................................................................... + mls v13.4S, v22.4S, v29.4S // ...........................................................................*............................................................................................................................................................................................................ + mul v19.4S, v21.4S, v3.S[0] // ........................................................................................*............................................................................................................................................................................................... + sub v12.4S, v16.4S, v12.4S // ....................................................................................................................*................................................................................................................................................................... 
+ sqrdmulh v22.4S, v21.4S, v3.S[1] // .........................................................................................*.............................................................................................................................................................................................. + sqrdmulh v16.4S, v20.4S, v6.S[3] // .................................................*...................................................................................................................................................................................................................................... + add v21.4S, v11.4S, v13.4S // ................................................................................................................*....................................................................................................................................................................... + sub v13.4S, v11.4S, v13.4S // ...............................................................................................................*........................................................................................................................................................................ + mul v11.4S, v20.4S, v6.S[2] // ................................................*....................................................................................................................................................................................................................................... + mul v20.4S, v28.4S, v2.S[2] // ...................................................................................*.................................................................................................................................................................................................... 
+ mls v19.4S, v22.4S, v29.4S // ..........................................................................................*............................................................................................................................................................................................. + sqrdmulh v28.4S, v28.4S, v2.S[3] // ....................................................................................*................................................................................................................................................................................................... + sub v22.4S, v8.4S, v27.4S // ........................................................................................................................................*............................................................................................................................................... + add v8.4S, v8.4S, v27.4S // .........................................................................................................................................*.............................................................................................................................................. + mls v11.4S, v16.4S, v29.4S // ..................................................*..................................................................................................................................................................................................................................... + mul v16.4S, v13.4S, v0.S[2] // .................................................................................................................*...................................................................................................................................................................... 
+ mls v20.4S, v28.4S, v29.4S // .....................................................................................*.................................................................................................................................................................................................. + sqrdmulh v28.4S, v13.4S, v0.S[3] // ..................................................................................................................*..................................................................................................................................................................... + sqrdmulh v13.4S, v12.4S, v1.S[1] // .......................................................................................................................*................................................................................................................................................................ + mul v12.4S, v12.4S, v1.S[0] // ......................................................................................................................*................................................................................................................................................................. + add v27.4S, v11.4S, v23.4S // ............................................................................................*........................................................................................................................................................................................... + sub v23.4S, v11.4S, v23.4S // ...........................................................................................*............................................................................................................................................................................................ 
+ mls v12.4S, v13.4S, v29.4S // ........................................................................................................................*............................................................................................................................................................... + add v13.4S, v9.4S, v27.4S // ..........................................................................................................................*............................................................................................................................................................. + sub v27.4S, v9.4S, v27.4S // .........................................................................................................................*.............................................................................................................................................................. + mul v11.4S, v23.4S, v3.S[0] // .............................................................................................*.......................................................................................................................................................................................... + add v9.4S, v15.4S, v13.4S // ..............................................................................................................................................*......................................................................................................................................... + sub v13.4S, v15.4S, v13.4S // .............................................................................................................................................*.......................................................................................................................................... 
+ sqrdmulh v23.4S, v23.4S, v3.S[1] // ..............................................................................................*......................................................................................................................................................................................... + mul v15.4S, v27.4S, v1.S[0] // ...........................................................................................................................*............................................................................................................................................................ + sqrdmulh v27.4S, v27.4S, v1.S[1] // ............................................................................................................................*........................................................................................................................................................... + mls v16.4S, v28.4S, v29.4S // ...................................................................................................................*.................................................................................................................................................................... + mul v28.4S, v17.4S, v0.S[2] // ..................................................................................................*..................................................................................................................................................................................... + sqrdmulh v17.4S, v17.4S, v0.S[3] // ...................................................................................................*.................................................................................................................................................................................... 
+ mls v11.4S, v23.4S, v29.4S // ...............................................................................................*........................................................................................................................................................................................ + sqrdmulh v23.4S, v24.4S, v0.S[3] // ........................................................................................................*............................................................................................................................................................................... + mul v24.4S, v24.4S, v0.S[2] // .......................................................................................................*................................................................................................................................................................................ + mls v15.4S, v27.4S, v29.4S // .............................................................................................................................*.......................................................................................................................................................... + mls v28.4S, v17.4S, v29.4S // ....................................................................................................*................................................................................................................................................................................... + mul v27.4S, v13.4S, v0.S[0] // ...............................................................................................................................................*........................................................................................................................................ 
+ add v17.4S, v20.4S, v11.4S // ....................................................................................................................................*................................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[1] // ................................................................................................................................................*....................................................................................................................................... + sub v11.4S, v20.4S, v11.4S // ...................................................................................................................................*.................................................................................................................................................... + mls v24.4S, v23.4S, v29.4S // .........................................................................................................*.............................................................................................................................................................................. + sub v23.4S, v28.4S, v12.4S // ............................................................................................................................................................*........................................................................................................................... + add v12.4S, v28.4S, v12.4S // .............................................................................................................................................................*.......................................................................................................................... 
+ sqrdmulh v28.4S, v11.4S, v1.S[1] // ......................................................................................................................................*................................................................................................................................................. + mul v20.4S, v11.4S, v1.S[0] // .....................................................................................................................................*.................................................................................................................................................. + mls v27.4S, v13.4S, v29.4S // .................................................................................................................................................*...................................................................................................................................... + sqrdmulh v13.4S, v23.4S, v0.S[1] // ...............................................................................................................................................................*........................................................................................................................ + sub count, count, #1 +layer1234_start: + mls v20.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v23.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... 
+ add v11.4S, v21.4S, v17.4S // ........................................................................................................................................................*............................................................................................................................... + sub v28.4S, v21.4S, v17.4S // .......................................................................................................................................................*................................................................................................................................ + sub v17.4S, v18.4S, v19.4S // ..............................................................................................................................*......................................................................................................................................................... + sub v21.4S, v24.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + add v19.4S, v18.4S, v19.4S // ...............................................................................................................................*........................................................................................................................................................ + mls v23.4S, v13.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... 
+ sqrdmulh v18.4S, v21.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... + mul v21.4S, v21.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + add v13.4S, v24.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + add v15.4S, v16.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... + sub v20.4S, v16.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ + cmge v24.4S, v23.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... 
+ mls v21.4S, v18.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + cmge v18.4S, v31.4S, v23.4S // ................................................................................................................................................................................................*....................................................................................... + mul v16.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sub v18.4S, v18.4S, v24.4S // ..................................................................................................................................................................................................*..................................................................................... + sqrdmulh v24.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ + mul v22.4S, v28.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. 
+ sqrdmulh v28.4S, v28.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + mls v23.4S, v18.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... + mul v18.4S, v17.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + mls v16.4S, v24.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + cmge v24.4S, v31.4S, v27.4S // ....................................................................................................................................................................................*................................................................................................... + mls v22.4S, v28.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ 
+ cmge v28.4S, v27.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + str q23, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + sub v24.4S, v24.4S, v28.4S // ......................................................................................................................................................................................*................................................................................................. + sqrdmulh v17.4S, v17.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + cmge v28.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. + cmge v23.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... 
+ mls v27.4S, v24.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ + cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + sub v23.4S, v23.4S, v28.4S // ......................................................................................................................................................................................................*................................................................................. + mls v18.4S, v17.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + sqrdmulh v9.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... 
+ mls v21.4S, v23.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + cmge v28.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + str q27, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + sub v23.4S, v14.4S, v18.4S // ......................................................................................................................................................................*................................................................................................................. + add v27.4S, v14.4S, v18.4S // .......................................................................................................................................................................*................................................................................................................ + mls v17.4S, v9.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... 
+ sub v14.4S, v24.4S, v28.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v28.4S, v23.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + sub v24.4S, v10.4S, v19.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v19.4S // ...................................................................................................................................................*.................................................................................................................................... + mls v16.4S, v14.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v9.4S, v24.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... 
+ sqrdmulh v18.4S, v24.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + mul v24.4S, v23.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mul v19.4S, v20.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... + cmge v14.4S, v31.4S, v22.4S // ............................................................................................................................................................................................*........................................................................................... + sqrdmulh v23.4S, v20.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... 
+ str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + mls v24.4S, v28.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + mls v9.4S, v18.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + sqrdmulh v20.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + sqrdmulh v28.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. 
+ cmge v13.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v18.4S, v9.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + cmge v16.4S, v31.4S, v9.4S // ........................................................................................................................................................................................*............................................................................................... + mls v21.4S, v20.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + cmge v20.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... + sub v18.4S, v16.4S, v18.4S // ..........................................................................................................................................................................................*............................................................................................. 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v23.4S, v20.4S, v13.4S // ......................................................................................................................................................................................................................................................*................................. + cmge v20.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... + cmge v16.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + cmge v13.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v17.4S, v23.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ 
+ cmge v23.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + sub v13.4S, v13.4S, v16.4S // ......................................................................................................................................................................................................................................................................*................. + sqrdmulh v16.4S, v27.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + sub v20.4S, v20.4S, v23.4S // ..........................................................................................................................................................................................................*............................................................................. + cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................*........................................................................... + mls v21.4S, v13.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ 
+ str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + mul v17.4S, v27.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v24.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + sqrdmulh v27.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + str q21, [x1, #320] // .....................................................................................................................................................................................................................................................................................*.. + mls v17.4S, v16.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... 
+ cmge v13.4S, v22.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + cmge v20.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + sqrdmulh v21.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + sub v13.4S, v14.4S, v13.4S // ..............................................................................................................................................................................................*......................................................................................... + mul v14.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... 
+ ldr q10, [x1, #336] // .....e.................................................................................................................................................................................................................................................................................. + mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + sub v23.4S, v23.4S, v20.4S // ..............................................................................................................................................................................................................*......................................................................... + mul v8.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v20.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v14.4S, v21.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... 
+ mls v19.4S, v23.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mls v16.4S, v28.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. + sqrdmulh v28.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v8.4S, v20.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ + str q19, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... 
+ mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... + cmge v11.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + cmge v21.4S, v17.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + cmge v23.4S, v8.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + mls v22.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + cmge v15.4S, v31.4S, v8.4S // ............................................................................................................................................................................................................................................................................*........... 
+ mls v19.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mls v9.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + ldr q13, [x1, #272] // ....e................................................................................................................................................................................................................................................................................... + cmge v28.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + str q9, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... 
+ cmge v12.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + str q22, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + cmge v24.4S, v31.4S, v17.4S // ........................................................................................................................................................................................................................................................................*............... + sub v15.4S, v15.4S, v23.4S // ..............................................................................................................................................................................................................................................................................*......... + ldr q23, [x1, #400] // ......e................................................................................................................................................................................................................................................................................. + sub v9.4S, v11.4S, v28.4S // ..................................................................................................................................................................................................................................................*..................................... 
+ ldr q28, [x1, #464] // .......e................................................................................................................................................................................................................................................................................ + mls v20.4S, v27.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + sub v11.4S, v24.4S, v21.4S // ..........................................................................................................................................................................................................................................................................*............. + ldr q27, [x1, #528] // ........e............................................................................................................................................................................................................................................................................... + mls v16.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + add v9.4S, v13.4S, v10.4S // ...........................e............................................................................................................................................................................................................................................................ 
+ mls v17.4S, v11.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v22.4S, v13.4S, v10.4S // ..........................e............................................................................................................................................................................................................................................................. + ldr q10, [x1, #592] // .........e.............................................................................................................................................................................................................................................................................. + cmge v13.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v21.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... + add v11.4S, v23.4S, v28.4S // ................................e....................................................................................................................................................................................................................................................... 
+ str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v24.4S, v23.4S, v28.4S // ...............................e........................................................................................................................................................................................................................................................ + sub v28.4S, v13.4S, v21.4S // ..................................................................................................................................................................................................................................................................*..................... + str q17, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + add v17.4S, v9.4S, v11.4S // ...................................................................e.................................................................................................................................................................................................................... + add v21.4S, v27.4S, v10.4S // .....................................e.................................................................................................................................................................................................................................................. 
+ sqrdmulh v23.4S, v22.4S, v4.S[3] // .............................e.......................................................................................................................................................................................................................................................... + mul v13.4S, v22.4S, v4.S[2] // ............................e........................................................................................................................................................................................................................................................... + ldr q16, [x1, #896] // ..............e......................................................................................................................................................................................................................................................................... + ldr q22, [x1, #960] // ...............e........................................................................................................................................................................................................................................................................ + mls v8.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ + sub v15.4S, v12.4S, v18.4S // ..............................................................................................................................................................................................................................................................*......................... 
+ ldr q12, [x1, #128] // ..e..................................................................................................................................................................................................................................................................................... + mul v18.4S, v24.4S, v5.S[0] // .................................e...................................................................................................................................................................................................................................................... + sqrdmulh v24.4S, v24.4S, v5.S[1] // ..................................e..................................................................................................................................................................................................................................................... + mls v13.4S, v23.4S, v29.4S // ..............................e......................................................................................................................................................................................................................................................... + mls v19.4S, v15.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ + sub v23.4S, v16.4S, v22.4S // ...................................................e.................................................................................................................................................................................................................................... 
+ add v22.4S, v16.4S, v22.4S // ....................................................e................................................................................................................................................................................................................................... + str q8, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + ldr q16, [x1, #192] // ...e.................................................................................................................................................................................................................................................................................... + mls v20.4S, v28.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + mls v18.4S, v24.4S, v29.4S // ...................................e.................................................................................................................................................................................................................................................... + ldr q15, [x1, #64] // .e...................................................................................................................................................................................................................................................................................... 
+ sub v28.4S, v9.4S, v11.4S // ..................................................................e..................................................................................................................................................................................................................... + sub v11.4S, v27.4S, v10.4S // ....................................e................................................................................................................................................................................................................................................... + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + ldr q9, [x1, #0] // e....................................................................................................................................................................................................................................................................................... + cmge v8.4S, v31.4S, v14.4S // ........................................................................................................................................................................................................................................................*............................... + cmge v24.4S, v14.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. 
+ add v19.4S, v13.4S, v18.4S // ........................................................................e............................................................................................................................................................................................................... + sub v10.4S, v13.4S, v18.4S // .......................................................................e................................................................................................................................................................................................................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + sub v18.4S, v8.4S, v24.4S // ..........................................................................................................................................................................................................................................................*............................. + add v13.4S, v12.4S, v16.4S // ......................e................................................................................................................................................................................................................................................................. + ldr q20, [x1, #640] // ..........e............................................................................................................................................................................................................................................................................. 
+ ldr q8, [x1, #704] // ...........e............................................................................................................................................................................................................................................................................ + sub v24.4S, v9.4S, v15.4S // ................e....................................................................................................................................................................................................................................................................... + sqrdmulh v27.4S, v10.4S, v2.S[1] // ..........................................................................e............................................................................................................................................................................................................. + mls v14.4S, v18.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + add v15.4S, v9.4S, v15.4S // .................e...................................................................................................................................................................................................................................................................... + sqrdmulh v9.4S, v24.4S, v3.S[3] // ...................e.................................................................................................................................................................................................................................................................... 
+ mul v18.4S, v24.4S, v3.S[2] // ..................e..................................................................................................................................................................................................................................................................... + mul v24.4S, v10.4S, v2.S[0] // .........................................................................e.............................................................................................................................................................................................................. + str q14, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... + sub v10.4S, v20.4S, v8.4S // .........................................e.............................................................................................................................................................................................................................................. + sqrdmulh v14.4S, v28.4S, v2.S[1] // .....................................................................e.................................................................................................................................................................................................................. + add v8.4S, v20.4S, v8.4S // ..........................................e............................................................................................................................................................................................................................................. 
+ sub v20.4S, v15.4S, v13.4S // ........................................................e............................................................................................................................................................................................................................... + mls v24.4S, v27.4S, v29.4S // ...........................................................................e............................................................................................................................................................................................................ + add v27.4S, v15.4S, v13.4S // .........................................................e.............................................................................................................................................................................................................................. + mls v18.4S, v9.4S, v29.4S // ....................e................................................................................................................................................................................................................................................................... + sub v16.4S, v12.4S, v16.4S // .....................e.................................................................................................................................................................................................................................................................. + mul v13.4S, v28.4S, v2.S[0] // ....................................................................e................................................................................................................................................................................................................... 
+ sqrdmulh v15.4S, v10.4S, v6.S[1] // ............................................e........................................................................................................................................................................................................................................... + mul v28.4S, v16.4S, v4.S[0] // .......................e................................................................................................................................................................................................................................................................ + sqrdmulh v16.4S, v16.4S, v4.S[1] // ........................e............................................................................................................................................................................................................................................................... + mul v9.4S, v10.4S, v6.S[0] // ...........................................e............................................................................................................................................................................................................................................ + sqrdmulh v12.4S, v23.4S, v7.S[1] // ......................................................e................................................................................................................................................................................................................................. + mul v23.4S, v23.4S, v7.S[0] // .....................................................e.................................................................................................................................................................................................................................. 
+ mls v13.4S, v14.4S, v29.4S // ......................................................................e................................................................................................................................................................................................................. + mls v28.4S, v16.4S, v29.4S // .........................e.............................................................................................................................................................................................................................................................. + sub v16.4S, v18.4S, v28.4S // .............................................................e.......................................................................................................................................................................................................................... + add v28.4S, v18.4S, v28.4S // ..............................................................e......................................................................................................................................................................................................................... + ldr q18, [x1, #832] // .............e.......................................................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v11.4S, v5.S[3] // .......................................e................................................................................................................................................................................................................................................ 
+ mls v23.4S, v12.4S, v29.4S // .......................................................e................................................................................................................................................................................................................................ + mul v14.4S, v11.4S, v5.S[2] // ......................................e................................................................................................................................................................................................................................................. + sub v12.4S, v28.4S, v19.4S // .....................................................................................................e.................................................................................................................................................................................. + mls v9.4S, v15.4S, v29.4S // .............................................e.......................................................................................................................................................................................................................................... + ldr q15, [x1, #768] // ............e........................................................................................................................................................................................................................................................................... + add v11.4S, v28.4S, v19.4S // ......................................................................................................e................................................................................................................................................................................. 
+ mul v19.4S, v20.4S, v1.S[2] // ..........................................................e............................................................................................................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v1.S[3] // ...........................................................e............................................................................................................................................................................................................................ + mls v14.4S, v10.4S, v29.4S // ........................................e............................................................................................................................................................................................................................................... + sqrdmulh v28.4S, v16.4S, v1.S[3] // ................................................................e....................................................................................................................................................................................................................... + mul v16.4S, v16.4S, v1.S[2] // ...............................................................e........................................................................................................................................................................................................................ + sub v10.4S, v15.4S, v18.4S // ..............................................e......................................................................................................................................................................................................................................... 
+ mls v19.4S, v20.4S, v29.4S // ............................................................e........................................................................................................................................................................................................................... + sub v20.4S, v14.4S, v9.4S // .................................................................................e...................................................................................................................................................................................................... + add v9.4S, v14.4S, v9.4S // ..................................................................................e..................................................................................................................................................................................................... + mul v14.4S, v10.4S, v6.S[2] // ................................................e....................................................................................................................................................................................................................................... + sqrdmulh v10.4S, v10.4S, v6.S[3] // .................................................e...................................................................................................................................................................................................................................... + mls v16.4S, v28.4S, v29.4S // .................................................................e...................................................................................................................................................................................................................... 
+ add v28.4S, v27.4S, v17.4S // .................................................................................................e...................................................................................................................................................................................... + add v15.4S, v15.4S, v18.4S // ...............................................e........................................................................................................................................................................................................................................ + sub v17.4S, v27.4S, v17.4S // ................................................................................................e....................................................................................................................................................................................... + mls v14.4S, v10.4S, v29.4S // ..................................................e..................................................................................................................................................................................................................................... + add v10.4S, v19.4S, v13.4S // ...........................................................................................................e............................................................................................................................................................................ + sub v18.4S, v19.4S, v13.4S // ..........................................................................................................e............................................................................................................................................................................. 
+ add v13.4S, v14.4S, v23.4S // ............................................................................................e........................................................................................................................................................................................... + sub v27.4S, v14.4S, v23.4S // ...........................................................................................e............................................................................................................................................................................................ + sub v23.4S, v21.4S, v8.4S // ............................................................................e........................................................................................................................................................................................................... + add v8.4S, v21.4S, v8.4S // .............................................................................e.......................................................................................................................................................................................................... + sub v19.4S, v16.4S, v24.4S // ...............................................................................................................e........................................................................................................................................................................ + add v21.4S, v16.4S, v24.4S // ................................................................................................................e....................................................................................................................................................................... 
+ mul v14.4S, v18.4S, v0.S[2] // ............................................................................................................e........................................................................................................................................................................... + sqrdmulh v24.4S, v18.4S, v0.S[3] // .............................................................................................................e.......................................................................................................................................................................... + add v18.4S, v9.4S, v13.4S // ..........................................................................................................................e............................................................................................................................................................. + sub v9.4S, v9.4S, v13.4S // .........................................................................................................................e.............................................................................................................................................................. + mul v13.4S, v20.4S, v2.S[2] // ...................................................................................e.................................................................................................................................................................................................... + sqrdmulh v16.4S, v20.4S, v2.S[3] // ....................................................................................e................................................................................................................................................................................................... 
+ sub v20.4S, v15.4S, v22.4S // ......................................................................................e................................................................................................................................................................................................. + add v22.4S, v15.4S, v22.4S // .......................................................................................e................................................................................................................................................................................................ + mul v15.4S, v9.4S, v1.S[0] // ...........................................................................................................................e............................................................................................................................................................ + sqrdmulh v9.4S, v9.4S, v1.S[1] // ............................................................................................................................e........................................................................................................................................................... + mls v14.4S, v24.4S, v29.4S // ..............................................................................................................e......................................................................................................................................................................... + mul v24.4S, v12.4S, v0.S[2] // .......................................................................................................e................................................................................................................................................................................ 
+ mls v13.4S, v16.4S, v29.4S // .....................................................................................e.................................................................................................................................................................................................. + mul v16.4S, v19.4S, v0.S[2] // .................................................................................................................e...................................................................................................................................................................... + mls v15.4S, v9.4S, v29.4S // .............................................................................................................................e.......................................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v0.S[3] // ........................................................................................................e............................................................................................................................................................................... + sqrdmulh v19.4S, v19.4S, v0.S[3] // ..................................................................................................................e..................................................................................................................................................................... + add v9.4S, v11.4S, v18.4S // ..............................................................................................................................................e......................................................................................................................................... 
+ sub v11.4S, v11.4S, v18.4S // .............................................................................................................................................e.......................................................................................................................................... + mul v18.4S, v23.4S, v2.S[2] // ..............................................................................e......................................................................................................................................................................................................... + sqrdmulh v23.4S, v23.4S, v2.S[3] // ...............................................................................e........................................................................................................................................................................................................ + mls v16.4S, v19.4S, v29.4S // ...................................................................................................................e.................................................................................................................................................................... + mls v24.4S, v12.4S, v29.4S // .........................................................................................................e.............................................................................................................................................................................. + mls v18.4S, v23.4S, v29.4S // ................................................................................e....................................................................................................................................................................................................... 
+ mul v12.4S, v17.4S, v0.S[2] // ..................................................................................................e..................................................................................................................................................................................... + sub v19.4S, v8.4S, v22.4S // ....................................................................................................................e................................................................................................................................................................... + add v8.4S, v8.4S, v22.4S // .....................................................................................................................e.................................................................................................................................................................. + sqrdmulh v22.4S, v17.4S, v0.S[3] // ...................................................................................................e.................................................................................................................................................................................... + sqrdmulh v23.4S, v19.4S, v1.S[1] // .......................................................................................................................e................................................................................................................................................................ + mul v19.4S, v19.4S, v1.S[0] // ......................................................................................................................e................................................................................................................................................................. 
+ mul v17.4S, v27.4S, v3.S[0] // .............................................................................................e.......................................................................................................................................................................................... + sqrdmulh v27.4S, v27.4S, v3.S[1] // ..............................................................................................e......................................................................................................................................................................................... + mls v12.4S, v22.4S, v29.4S // ....................................................................................................e................................................................................................................................................................................... + sub v22.4S, v28.4S, v8.4S // ........................................................................................................................................e............................................................................................................................................... + mls v19.4S, v23.4S, v29.4S // ........................................................................................................................e............................................................................................................................................................... + add v8.4S, v28.4S, v8.4S // .........................................................................................................................................e.............................................................................................................................................. 
+ sqrdmulh v28.4S, v11.4S, v0.S[1] // ................................................................................................................................................e....................................................................................................................................... + mls v17.4S, v27.4S, v29.4S // ...............................................................................................e........................................................................................................................................................................................ + mul v27.4S, v11.4S, v0.S[0] // ...............................................................................................................................................e........................................................................................................................................ + sqrdmulh v11.4S, v20.4S, v3.S[1] // .........................................................................................e.............................................................................................................................................................................................. + sub v23.4S, v12.4S, v19.4S // ............................................................................................................................................................e........................................................................................................................... + add v12.4S, v12.4S, v19.4S // .............................................................................................................................................................e.......................................................................................................................... 
+ mul v19.4S, v20.4S, v3.S[0] // ........................................................................................e............................................................................................................................................................................................... + sub v20.4S, v13.4S, v17.4S // ...................................................................................................................................e.................................................................................................................................................... + mls v27.4S, v28.4S, v29.4S // .................................................................................................................................................e...................................................................................................................................... + add v17.4S, v13.4S, v17.4S // ....................................................................................................................................e................................................................................................................................................... + sqrdmulh v13.4S, v23.4S, v0.S[1] // ...............................................................................................................................................................e........................................................................................................................ + sqrdmulh v28.4S, v20.4S, v1.S[1] // ......................................................................................................................................e................................................................................................................................................. 
+ mls v19.4S, v11.4S, v29.4S // ..........................................................................................e............................................................................................................................................................................................. + mul v20.4S, v20.4S, v1.S[0] // .....................................................................................................................................e.................................................................................................................................................. + + // original source code + // ldr q8, [x1, #0] // .....................................................................e......................................................................................................................|................................................................................................................................................................e................. + // ldr q9, [x1, #(1*(512/8))] // .................................................................e..........................................................................................................................|............................................................................................................................................................e..................... + // ldr q10, [x1, #(2*(512/8))] // ......................................................e.....................................................................................................................................|.................................................................................................................................................e................................ 
+ // ldr q11, [x1, #(3*(512/8))] // ..............................................................e.............................................................................................................................|.........................................................................................................................................................e........................ + // ldr q12, [x1, #(4*(512/8))] // ....................e.......................................................................................................................................................................|...............................................................................................................e.................................................................. + // ldr q13, [x1, #(5*(512/8))] // e...........................................................................................................................................................................................|...........................................................................................e...................................................................................... + // ldr q14, [x1, #(6*(512/8))] // ............................e...............................................................................................................................................................|.......................................................................................................................e.......................................................... 
+ // ldr q15, [x1, #(7*(512/8))] // ..............................e.............................................................................................................................................................|.........................................................................................................................e........................................................ + // ldr q16, [x1, #(8*(512/8))] // .................................e..........................................................................................................................................................|............................................................................................................................e..................................................... + // ldr q17, [x1, #(9*(512/8))] // ......................................e.....................................................................................................................................................|.................................................................................................................................e................................................ + // ldr q18, [x1, #(10*(512/8))] // .............................................................................e..............................................................................................................|........................................................................................................................................................................e......... 
+ // ldr q19, [x1, #(11*(512/8))] // ..............................................................................e.............................................................................................................|.........................................................................................................................................................................e........ + // ldr q20, [x1, #(12*(512/8))] // ................................................................................................................e...........................................................................|.................................................................................................................................................................................. + // ldr q21, [x1, #(13*(512/8))] // ..........................................................................................................e.................................................................................|.................................................................................................................................................................................. + // ldr q22, [x1, #(14*(512/8))] // ..................................................e.........................................................................................................................................|.............................................................................................................................................e.................................... 
+ // ldr q23, [x1, #(15*(512/8))] // ...................................................e........................................................................................................................................|..............................................................................................................................................e................................... + // sub v24.4s, v8.4s, v9.4s // ...............................................................................e............................................................................................................|..........................................................................................................................................................................e....... + // add v8.4s, v8.4s, v9.4s // ..................................................................................e.........................................................................................................|.............................................................................................................................................................................e.... + // mul v9.4s, v24.4s, v3.s[2] // ....................................................................................e.......................................................................................................|...............................................................................................................................................................................e.. 
+ // sqrdmulh v24.4s, v24.4s, v3.s[3] // ...................................................................................e........................................................................................................|..............................................................................................................................................................................e... + // mls v9.4s, v24.4s, v29.4s // .............................................................................................e..............................................................................................|.................................................................................................................................................................................. + // sub v24.4s, v10.4s, v11.4s // ..............................................................................................e.............................................................................................|.................................................................................................................................................................................. + // add v10.4s, v10.4s, v11.4s // ............................................................................e...............................................................................................................|.......................................................................................................................................................................e.......... 
+ // mul v11.4s, v24.4s, v4.s[0] // .................................................................................................e..........................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.s[1] // ..................................................................................................e.........................................................................................|.................................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // .......................................................................................................e....................................................................................|.................................................................................................................................................................................. + // sub v24.4s, v12.4s, v13.4s // .....................................e......................................................................................................................................................|................................................................................................................................e................................................. 
+ // add v12.4s, v12.4s, v13.4s // ...................................e........................................................................................................................................................|..............................................................................................................................e................................................... + // mul v13.4s, v24.4s, v4.s[2] // .................................................e..........................................................................................................................................|............................................................................................................................................e..................................... + // sqrdmulh v24.4s, v24.4s, v4.s[3] // ................................................e...........................................................................................................................................|...........................................................................................................................................e...................................... + // mls v13.4s, v24.4s, v29.4s // .........................................................e..................................................................................................................................|....................................................................................................................................................e............................. 
+ // sub v24.4s, v14.4s, v15.4s // ...........................................e................................................................................................................................................|......................................................................................................................................e........................................... + // add v14.4s, v14.4s, v15.4s // .........................................e..................................................................................................................................................|....................................................................................................................................e............................................. + // mul v15.4s, v24.4s, v5.s[0] // .......................................................e....................................................................................................................................|..................................................................................................................................................e............................... + // sqrdmulh v24.4s, v24.4s, v5.s[1] // ........................................................e...................................................................................................................................|...................................................................................................................................................e.............................. 
+ // mls v15.4s, v24.4s, v29.4s // ................................................................e...........................................................................................................................|...........................................................................................................................................................e...................... + // sub v24.4s, v16.4s, v17.4s // ...................................................................e........................................................................................................................|..............................................................................................................................................................e................... + // add v16.4s, v16.4s, v17.4s // ...............................................e............................................................................................................................................|..........................................................................................................................................e....................................... + // mul v17.4s, v24.4s, v5.s[2] // .............................................................................................................e..............................................................................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v5.s[3] // ...........................................................................................................e................................................................................|.................................................................................................................................................................................. + // mls v17.4s, v24.4s, v29.4s // ....................................................................................................................e.......................................................................|.................................................................................................................................................................................. + // sub v24.4s, v18.4s, v19.4s // .......................................................................................e....................................................................................................|.................................................................................................................................................................................. + // add v18.4s, v18.4s, v19.4s // .........................................................................................e..................................................................................................|.................................................................................................................................................................................. 
+ // mul v19.4s, v24.4s, v6.s[0] // ...................................................................................................e........................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[1] // ................................................................................................e...........................................................................................|.................................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // ...............................................................................................................e............................................................................|.................................................................................................................................................................................. + // sub v24.4s, v20.4s, v21.4s // .......................................................................................................................e....................................................................|.................................................................................................................................................................................. 
+ // add v20.4s, v20.4s, v21.4s // ...............................................................................................................................e............................................................|.................................................................................................................................................................................. + // mul v21.4s, v24.4s, v6.s[2] // ...........................................................................................................................e................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.s[3] // ............................................................................................................................e...............................................................|.................................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // .................................................................................................................................e..........................................................|.................................................................................................................................................................................. 
+ // sub v24.4s, v22.4s, v23.4s // ...........................................................e................................................................................................................................|......................................................................................................................................................e........................... + // add v22.4s, v22.4s, v23.4s // ............................................................e...............................................................................................................................|.......................................................................................................................................................e.......................... + // mul v23.4s, v24.4s, v7.s[0] // .....................................................................................................e......................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v7.s[1] // ....................................................................................................e.......................................................................................|.................................................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................e...............................................................................|.................................................................................................................................................................................. + // sub v24.4s, v8.4s, v10.4s // ..........................................................................................e.................................................................................................|.................................................................................................................................................................................. + // add v8.4s, v8.4s, v10.4s // ............................................................................................e...............................................................................................|.................................................................................................................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // ..................................................................................................................e.........................................................................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................................................................................................e........................................................................|.................................................................................................................................................................................. + // mls v10.4s, v24.4s, v29.4s // ........................................................................................................................e...................................................................|.................................................................................................................................................................................. + // sub v24.4s, v9.4s, v11.4s // ........................................................................................................e...................................................................................|.................................................................................................................................................................................. + // add v9.4s, v9.4s, v11.4s // .........................................................................................................e..................................................................................|.................................................................................................................................................................................. 
+ // mul v11.4s, v24.4s, v1.s[2] // ......................................................................................................................e.....................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................................................................................................................e......................................................................|.................................................................................................................................................................................. + // mls v11.4s, v24.4s, v29.4s // .............................................................................................................................e..............................................................|.................................................................................................................................................................................. + // sub v24.4s, v12.4s, v14.4s // ..................................................................e.........................................................................................................................|.............................................................................................................................................................e.................... 
+ // add v12.4s, v12.4s, v14.4s // ..............................................e.............................................................................................................................................|.........................................................................................................................................e........................................ + // mul v14.4s, v24.4s, v2.s[0] // ...............................................................................................e............................................................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ........................................................................................e...................................................................................................|.................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ......................................................................................................e.....................................................................................|.................................................................................................................................................................................. 
+ // sub v24.4s, v13.4s, v15.4s // .........................................................................e..................................................................................................................|....................................................................................................................................................................e............. + // add v13.4s, v13.4s, v15.4s // ........................................................................e...................................................................................................................|...................................................................................................................................................................e.............. + // mul v15.4s, v24.4s, v2.s[0] // .....................................................................................e......................................................................................................|................................................................................................................................................................................e. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................e...........................................................................................................|...........................................................................................................................................................................e...... 
+ // mls v15.4s, v24.4s, v29.4s // ...........................................................................................e................................................................................................|.................................................................................................................................................................................. + // sub v24.4s, v16.4s, v18.4s // ......................................................................................................................................e.....................................................|.................................................................................................................................................................................. + // add v16.4s, v16.4s, v18.4s // .......................................................................................................................................e....................................................|.................................................................................................................................................................................. + // mul v18.4s, v24.4s, v2.s[2] // .............................................................................................................................................................e..............................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................................................e.............................|.................................................................................................................................................................................. + // mls v18.4s, v24.4s, v29.4s // .................................................................................................................................................................e..........................|.................................................................................................................................................................................. + // sub v24.4s, v17.4s, v19.4s // .........................................................................................................................e..................................................................|.................................................................................................................................................................................. + // add v17.4s, v17.4s, v19.4s // ..........................................................................................................................e.................................................................|.................................................................................................................................................................................. 
+ // mul v19.4s, v24.4s, v2.s[2] // ..............................................................................................................................................e.............................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...............................................................................................................................................e............................................|.................................................................................................................................................................................. + // mls v19.4s, v24.4s, v29.4s // ......................................................................................................................................................e.....................................|.................................................................................................................................................................................. + // sub v24.4s, v20.4s, v22.4s // ................................................................................................................................................e...........................................|.................................................................................................................................................................................. 
+ // add v20.4s, v20.4s, v22.4s // .................................................................................................................................................e..........................................|.................................................................................................................................................................................. + // mul v22.4s, v24.4s, v3.s[0] // ....................................................................................................................................................................................e.......|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .................................................................................................................................................................................e..........|.................................................................................................................................................................................. + // mls v22.4s, v24.4s, v29.4s // ..........................................................................................................................................................................................e.|.................................................................................................................................................................................. 
+ // sub v24.4s, v21.4s, v23.4s // .....................................................................................................................................e......................................................|.................................................................................................................................................................................. + // add v21.4s, v21.4s, v23.4s // ....................................................................................................................................e.......................................................|.................................................................................................................................................................................. + // mul v23.4s, v24.4s, v3.s[0] // ........................................................................................................................................................................e...................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................................................................e..................|.................................................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ...............................................................................................................................................................................e............|.................................................................................................................................................................................. + // sub v24.4s, v8.4s, v12.4s // ................................................................................................................................e...........................................................|.................................................................................................................................................................................. + // add v8.4s, v8.4s, v12.4s // ..............................................................................................................................e.............................................................|.................................................................................................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................................................................e.........................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................................................e......................|.................................................................................................................................................................................. + // mls v12.4s, v24.4s, v29.4s // ..........................................................................................................................................................................e.................|.................................................................................................................................................................................. + // sub v24.4s, v9.4s, v13.4s // ..............................................................................................................e.............................................................................|.................................................................................................................................................................................. + // add v9.4s, v9.4s, v13.4s // .................................................................................................................e..........................................................................|.................................................................................................................................................................................. 
+ // mul v13.4s, v24.4s, v0.s[2] // .....................................................................................................................................................e......................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................................................e..................................|.................................................................................................................................................................................. + // mls v13.4s, v24.4s, v29.4s // ................................................................................................................................................................e...........................|.................................................................................................................................................................................. + // sub v24.4s, v10.4s, v14.4s // ...................................................................................................................................e........................................................|.................................................................................................................................................................................. 
+ // add v10.4s, v10.4s, v14.4s // ..................................................................................................................................e.........................................................|.................................................................................................................................................................................. + // mul v14.4s, v24.4s, v0.s[2] // ..........................................................................................................................................e.................................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................................................................................................................................e................................................|.................................................................................................................................................................................. + // mls v14.4s, v24.4s, v29.4s // ....................................................................................................................................................e.......................................|.................................................................................................................................................................................. 
+ // sub v24.4s, v11.4s, v15.4s // ........................................................................................................................................e...................................................|.................................................................................................................................................................................. + // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................e..................................................|.................................................................................................................................................................................. + // mul v15.4s, v24.4s, v0.s[2] // .......................................................................................................................................................e....................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..........................................................................................................................................................e.................................|.................................................................................................................................................................................. 
+ // mls v15.4s, v24.4s, v29.4s // ...............................................................................................................................................................e............................|.................................................................................................................................................................................. + // sub v24.4s, v16.4s, v20.4s // ...................................................................................................................................................................e........................|.................................................................................................................................................................................. + // add v16.4s, v16.4s, v20.4s // ....................................................................................................................................................................e.......................|.................................................................................................................................................................................. + // mul v20.4s, v24.4s, v1.s[0] // .......................................................................................................................................................................e....................|.................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ......................................................................................................................................................................e.....................|.................................................................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................e...............|.................................................................................................................................................................................. + // sub v24.4s, v17.4s, v21.4s // .............................................................................................................................................e..............................................|.................................................................................................................................................................................. + // add v17.4s, v17.4s, v21.4s // ............................................................................................................................................e...............................................|.................................................................................................................................................................................. 
+ // mul v21.4s, v24.4s, v1.s[0] // ..................................................................................................................................................e.........................................|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................e........................................|.................................................................................................................................................................................. + // mls v21.4s, v24.4s, v29.4s // ........................................................................................................................................................e...................................|.................................................................................................................................................................................. + // sub v24.4s, v18.4s, v22.4s // ............................................................................................................................................................................................|...*.............................................................................................................................................................................. 
+ // add v18.4s, v18.4s, v22.4s // ............................................................................................................................................................................................|.....*............................................................................................................................................................................ + // mul v22.4s, v24.4s, v1.s[0] // ............................................................................................................................................................................................|.....................*............................................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ............................................................................................................................................................................................|............................*..................................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|..................................*............................................................................................................................................... 
+ // sub v24.4s, v19.4s, v23.4s // .....................................................................................................................................................................................e......|.................................................................................................................................................................................. + // add v19.4s, v19.4s, v23.4s // .......................................................................................................................................................................................e....|.................................................................................................................................................................................. + // mul v23.4s, v24.4s, v1.s[0] // ...........................................................................................................................................................................................e|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................................................................e..|.................................................................................................................................................................................. 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................*.................................................................................................................................................................................. + // sub v24.4s, v8.4s, v16.4s // ...........................................................................................................................................................................e................|.................................................................................................................................................................................. + // add v8.4s, v8.4s, v16.4s // .............................................................................................................................................................................e..............|.................................................................................................................................................................................. + // mul v16.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|...............*.................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.................*................................................................................................................................................................ + // mls v16.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|......................*........................................................................................................................................................... + // sub v24.4s, v9.4s, v17.4s // ............................................................................................................................................................e...............................|.................................................................................................................................................................................. + // add v9.4s, v9.4s, v17.4s // ...........................................................................................................................................................e................................|.................................................................................................................................................................................. 
+ // mul v17.4s, v24.4s, v0.s[0] // ................................................................................................................................................................................e...........|.................................................................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................................................................................................e.............|.................................................................................................................................................................................. + // mls v17.4s, v24.4s, v29.4s // ......................................................................................................................................................................................e.....|.................................................................................................................................................................................. + // sub v24.4s, v10.4s, v18.4s // ............................................................................................................................................................................................|.............................................*.................................................................................................................................... 
+ // add v10.4s, v10.4s, v18.4s // ............................................................................................................................................................................................|..............................................*................................................................................................................................... + // mul v18.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|................................................*................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.................................................*................................................................................................................................ + // mls v18.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|.........................................................*........................................................................................................................ 
+ // sub v24.4s, v11.4s, v19.4s // ............................................................................................................................................................................................|..*............................................................................................................................................................................... + // add v11.4s, v11.4s, v19.4s // ............................................................................................................................................................................................|.*................................................................................................................................................................................ + // mul v19.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|..................*............................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|...................*.............................................................................................................................................................. 
+ // mls v19.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|........................*......................................................................................................................................................... + // sub v24.4s, v12.4s, v20.4s // ..................................................................................................................................................................................e.........|.................................................................................................................................................................................. + // add v12.4s, v12.4s, v20.4s // ...................................................................................................................................................................................e........|.................................................................................................................................................................................. + // mul v20.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|*................................................................................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................................................................................................................................................e...|.................................................................................................................................................................................. + // mls v20.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|......*........................................................................................................................................................................... + // sub v24.4s, v13.4s, v21.4s // ............................................................................................................................................................................................|....*............................................................................................................................................................................. + // add v13.4s, v13.4s, v21.4s // ............................................................................................................................................................................................|.........*........................................................................................................................................................................ 
+ // mul v21.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|........*......................................................................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|.......*.......................................................................................................................................................................... + // mls v21.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|.............*.................................................................................................................................................................... + // sub v24.4s, v14.4s, v22.4s // ............................................................................................................................................................................................|........................................*......................................................................................................................................... 
+ // add v14.4s, v14.4s, v22.4s // ............................................................................................................................................................................................|.........................................*........................................................................................................................................ + // mul v22.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|..................................................*............................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|............................................*..................................................................................................................................... + // mls v22.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|........................................................*......................................................................................................................... 
+ // sub v24.4s, v15.4s, v23.4s // ............................................................................................................................................................................................|...........*...................................................................................................................................................................... + // add v15.4s, v15.4s, v23.4s // ............................................................................................................................................................................................|..........*....................................................................................................................................................................... + // mul v23.4s, v24.4s, v0.s[0] // ............................................................................................................................................................................................|...................................................*.............................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ............................................................................................................................................................................................|......................................................*........................................................................................................................... 
+ // mls v23.4s, v24.4s, v29.4s // ............................................................................................................................................................................................|...................................................................*.............................................................................................................. + // cmge v27.4s, v31.4s, v16.4s // ............................................................................................................................................................................................|................................*................................................................................................................................................. + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................................................................................................|......................................*........................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|...........................................*...................................................................................................................................... 
+ // mls v16.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|...............................................*.................................................................................................................................. + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................|.......................*.......................................................................................................................................................... + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................|.........................*........................................................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|...........................*...................................................................................................................................................... 
+ // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|...............................*.................................................................................................................................................. + // cmge v27.4s, v31.4s, v18.4s // ............................................................................................................................................................................................|...............................................................*.................................................................................................................. + // cmge v28.4s, v18.4s, v30.4s // ............................................................................................................................................................................................|..............................................................*................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|..................................................................*............................................................................................................... 
+ // mls v18.4s, v28.4s, v29.4s // ...................*........................................................................................................................................................................|..............................................................................................................*................................................................... + // cmge v27.4s, v31.4s, v19.4s // ............................................................................................................................................................................................|.....................................................*............................................................................................................................ + // cmge v28.4s, v19.4s, v30.4s // ............................................................................................................................................................................................|.....................................................................................*............................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|.........................................................................................*........................................................................................ 
+ // mls v19.4s, v28.4s, v29.4s // ................*...........................................................................................................................................................................|...........................................................................................................*...................................................................... + // cmge v27.4s, v31.4s, v20.4s // ............................................................................................................................................................................................|..............*................................................................................................................................................................... + // cmge v28.4s, v20.4s, v30.4s // ............................................................................................................................................................................................|............*..................................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|................*................................................................................................................................................................. 
+ // mls v20.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|....................*............................................................................................................................................................. + // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................................|..............................*................................................................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................................|.............................*.................................................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|.................................*................................................................................................................................................ 
+ // mls v21.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|.....................................*............................................................................................................................................ + // cmge v27.4s, v31.4s, v22.4s // ............................................................................................................................................................................................|.....................................................................*............................................................................................................ + // cmge v28.4s, v22.4s, v30.4s // ............................................................................................................................................................................................|.........................................................................*........................................................................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|............................................................................*..................................................................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|.................................................................................*................................................................................................ + // cmge v27.4s, v31.4s, v23.4s // ............................................................................................................................................................................................|.............................................................................*.................................................................................................... + // cmge v28.4s, v23.4s, v30.4s // ............................................................................................................................................................................................|.......................................................................................*.......................................................................................... + // sub v28.4s, v27.4s, v28.4s // ..*.........................................................................................................................................................................................|.............................................................................................*.................................................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ......*.....................................................................................................................................................................................|.................................................................................................*................................................................................ + // str q16, [x1, #(8*(512/8))] // ............................................................................................................................................................................................|....................................................*............................................................................................................................. + // str q17, [x1, #(9*(512/8))] // ............................................................................................................................................................................................|.......................................*.......................................................................................................................................... + // str q18, [x1, #(10*(512/8))] // .......................*....................................................................................................................................................................|..................................................................................................................*............................................................... 
+ // str q19, [x1, #(11*(512/8))] // .........................*..................................................................................................................................................................|....................................................................................................................*............................................................. + // str q20, [x1, #(12*(512/8))] // ............................................................................................................................................................................................|..........................*....................................................................................................................................................... + // str q21, [x1, #(13*(512/8))] // ............................................................................................................................................................................................|.......................................................*.......................................................................................................................... + // str q22, [x1, #(14*(512/8))] // ............................................................................................................................................................................................|......................................................................................*........................................................................................... 
+ // str q23, [x1, #(15*(512/8))] // ..........*.................................................................................................................................................................................|.....................................................................................................*............................................................................ + // mul v16.4s, v8.4s, v25.4s // .*..........................................................................................................................................................................................|............................................................................................*..................................................................................... + // sqrdmulh v8.4s, v8.4s, v26.4s // ............................................................................................................................................................................................|............................................................*..................................................................................................................... + // mls v16.4s, v8.4s, v29.4s // .......*....................................................................................................................................................................................|..................................................................................................*............................................................................... 
+ // mul v17.4s, v9.4s, v25.4s // ............................................................................................................................................................................................|...................................*.............................................................................................................................................. + // sqrdmulh v9.4s, v9.4s, v26.4s // ............................................................................................................................................................................................|....................................*............................................................................................................................................. + // mls v17.4s, v9.4s, v29.4s // ............................................................................................................................................................................................|..........................................*....................................................................................................................................... + // mul v18.4s, v10.4s, v25.4s // ............................................................................................................................................................................................|..........................................................................................*....................................................................................... 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // ............................................................................................................................................................................................|........................................................................................*......................................................................................... + // mls v18.4s, v10.4s, v29.4s // .....*......................................................................................................................................................................................|................................................................................................*................................................................................. + // mul v19.4s, v11.4s, v25.4s // ............*...............................................................................................................................................................................|.......................................................................................................*.......................................................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ........*...................................................................................................................................................................................|...................................................................................................*.............................................................................. 
+ // mls v19.4s, v11.4s, v29.4s // ..................*.........................................................................................................................................................................|.............................................................................................................*.................................................................... + // mul v20.4s, v12.4s, v25.4s // ...........*................................................................................................................................................................................|......................................................................................................*........................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ............................................................................................................................................................................................|..................................................................................*............................................................................................... + // mls v20.4s, v12.4s, v29.4s // ...............................*............................................................................................................................................................|..........................................................................................................................*....................................................... 
+ // mul v21.4s, v13.4s, v25.4s // ............................................................................................................................................................................................|..........................................................*....................................................................................................................... + // sqrdmulh v13.4s, v13.4s, v26.4s // ............................................................................................................................................................................................|...........................................................*...................................................................................................................... + // mls v21.4s, v13.4s, v29.4s // ............................................................................................................................................................................................|................................................................*................................................................................................................. + // mul v22.4s, v14.4s, v25.4s // ............................................................................................................................................................................................|................................................................................*................................................................................................. 
+ // sqrdmulh v14.4s, v14.4s, v26.4s // ............................................................................................................................................................................................|...........................................................................*...................................................................................................... + // mls v22.4s, v14.4s, v29.4s // ............................................................................................................................................................................................|....................................................................................*............................................................................................. + // mul v23.4s, v15.4s, v25.4s // ...*........................................................................................................................................................................................|..............................................................................................*................................................................................... + // sqrdmulh v15.4s, v15.4s, v26.4s // ....*.......................................................................................................................................................................................|...............................................................................................*.................................................................................. 
+ // mls v23.4s, v15.4s, v29.4s // .........*..................................................................................................................................................................................|....................................................................................................*............................................................................. + // cmge v27.4s, v31.4s, v16.4s // .............*..............................................................................................................................................................................|........................................................................................................*......................................................................... + // cmge v28.4s, v16.4s, v30.4s // .....................*......................................................................................................................................................................|................................................................................................................*................................................................. + // sub v28.4s, v27.4s, v28.4s // .............................*..............................................................................................................................................................|........................................................................................................................*......................................................... 
+ // mls v16.4s, v28.4s, v29.4s // ..................................*.........................................................................................................................................................|.............................................................................................................................*.................................................... + // cmge v27.4s, v31.4s, v17.4s // ............................................................................................................................................................................................|.................................................................*................................................................................................................ + // cmge v28.4s, v17.4s, v30.4s // ............................................................................................................................................................................................|.............................................................*.................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|....................................................................*............................................................................................................. 
+ // mls v17.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|........................................................................*......................................................................................................... + // cmge v27.4s, v31.4s, v18.4s // ......................................................................*.....................................................................................................................|.................................................................................................................................................................*................ + // cmge v28.4s, v18.4s, v30.4s // .......................................................................*....................................................................................................................|..................................................................................................................................................................*............... + // sub v28.4s, v27.4s, v28.4s // ...........................................................................*................................................................................................................|......................................................................................................................................................................*........... 
+ // mls v18.4s, v28.4s, v29.4s // .................................................................................*..........................................................................................................|............................................................................................................................................................................*..... + // cmge v27.4s, v31.4s, v19.4s // ........................*...................................................................................................................................................................|...................................................................................................................*.............................................................. + // cmge v28.4s, v19.4s, v30.4s // ......................*.....................................................................................................................................................................|.................................................................................................................*................................................................ + // sub v28.4s, v27.4s, v28.4s // .....................................................*......................................................................................................................................|................................................................................................................................................*................................. 
+ // mls v19.4s, v28.4s, v29.4s // ..........................................................*.................................................................................................................................|.....................................................................................................................................................*............................ + // cmge v27.4s, v31.4s, v20.4s // .......................................*....................................................................................................................................................|..................................................................................................................................*............................................... + // cmge v28.4s, v20.4s, v30.4s // ........................................*...................................................................................................................................................|...................................................................................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ............................................*...............................................................................................................................................|.......................................................................................................................................*.......................................... 
+ // mls v20.4s, v28.4s, v29.4s // ...............................................................*............................................................................................................................|..........................................................................................................................................................*....................... + // cmge v27.4s, v31.4s, v21.4s // ............................................................................................................................................................................................|.......................................................................*.......................................................................................................... + // cmge v28.4s, v21.4s, v30.4s // ............................................................................................................................................................................................|......................................................................*........................................................................................................... + // sub v28.4s, v27.4s, v28.4s // ............................................................................................................................................................................................|..........................................................................*....................................................................................................... 
+ // mls v21.4s, v28.4s, v29.4s // ............................................................................................................................................................................................|..............................................................................*................................................................................................... + // cmge v27.4s, v31.4s, v22.4s // ..........................*.................................................................................................................................................................|.....................................................................................................................*............................................................ + // cmge v28.4s, v22.4s, v30.4s // ..............*.............................................................................................................................................................................|.........................................................................................................*........................................................................ + // sub v28.4s, v27.4s, v28.4s // ................................*...........................................................................................................................................................|...........................................................................................................................*...................................................... 
+ // mls v22.4s, v28.4s, v29.4s // ....................................*.......................................................................................................................................................|...............................................................................................................................*.................................................. + // cmge v27.4s, v31.4s, v23.4s // .................*..........................................................................................................................................................................|............................................................................................................*..................................................................... + // cmge v28.4s, v23.4s, v30.4s // ...............*............................................................................................................................................................................|..........................................................................................................*....................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................*................................................................................................................................................................|......................................................................................................................*........................................................... 
+ // mls v23.4s, v28.4s, v29.4s // ....................................................*.......................................................................................................................................|...............................................................................................................................................*.................................. + // str q16, [x1], #(16) // ..........................................*.................................................................................................................................................|.....................................................................................................................................*............................................ + // str q17, [x1, #(-16 + 1*(512/8))] // ............................................................................................................................................................................................|...............................................................................*.................................................................................................. 
+ // str q18, [x1, #(-16 + 2*(512/8))] // ......................................................................................*.....................................................................................................|.................................................................................................................................................................................* + // str q19, [x1, #(-16 + 3*(512/8))] // ....................................................................*.......................................................................................................................|...............................................................................................................................................................*.................. + // str q20, [x1, #(-16 + 4*(512/8))] // ..........................................................................*.................................................................................................................|.....................................................................................................................................................................*............ + // str q21, [x1, #(-16 + 5*(512/8))] // ............................................................................................................................................................................................|...................................................................................*.............................................................................................. 
+ // str q22, [x1, #(-16 + 6*(512/8))] // .............................................*..............................................................................................................................................|........................................................................................................................................*......................................... + // str q23, [x1, #(-16 + 7*(512/8))] // .............................................................*..............................................................................................................................|........................................................................................................................................................*......................... + + sub count, count, #1 + cbnz count, layer1234_start + mls v20.4S, v28.4S, v29.4S // .......................................................................................................................................*................................................................................................................................................ + mul v11.4S, v23.4S, v0.S[0] // ..............................................................................................................................................................*......................................................................................................................... + add v28.4S, v18.4S, v19.4S // ...............................................................................................................................*........................................................................................................................................................ 
+ sub v23.4S, v18.4S, v19.4S // ..............................................................................................................................*......................................................................................................................................................... + cmge v19.4S, v27.4S, v30.4S // .....................................................................................................................................................................................*.................................................................................................. + cmge v18.4S, v31.4S, v27.4S // ....................................................................................................................................................................................*................................................................................................... + mls v11.4S, v13.4S, v29.4S // ................................................................................................................................................................*....................................................................................................................... + sub v13.4S, v10.4S, v28.4S // ..................................................................................................................................................*..................................................................................................................................... + add v10.4S, v10.4S, v28.4S // ...................................................................................................................................................*.................................................................................................................................... 
+ sub v28.4S, v18.4S, v19.4S // ......................................................................................................................................................................................*................................................................................................. + mul v19.4S, v13.4S, v0.S[0] // ....................................................................................................................................................*................................................................................................................................... + sqrdmulh v13.4S, v13.4S, v0.S[1] // .....................................................................................................................................................*.................................................................................................................................. + sqrdmulh v18.4S, v23.4S, v1.S[1] // .................................................................................................................................*...................................................................................................................................................... + mul v23.4S, v23.4S, v1.S[0] // ................................................................................................................................*....................................................................................................................................................... + mls v27.4S, v28.4S, v29.4S // .......................................................................................................................................................................................*................................................................................................ 
+ sub v28.4S, v24.4S, v15.4S // .................................................................................................................................................................*...................................................................................................................... + mls v19.4S, v13.4S, v29.4S // ......................................................................................................................................................*................................................................................................................................. + add v13.4S, v24.4S, v15.4S // ..................................................................................................................................................................*..................................................................................................................... + cmge v24.4S, v31.4S, v11.4S // ................................................................................................................................................................................................*....................................................................................... + mls v23.4S, v18.4S, v29.4S // ..................................................................................................................................*..................................................................................................................................................... + sub v18.4S, v16.4S, v20.4S // ...........................................................................................................................................................................*............................................................................................................ 
+ add v15.4S, v16.4S, v20.4S // ............................................................................................................................................................................*........................................................................................................... + cmge v20.4S, v19.4S, v30.4S // .........................................................................................................................................................................................*.............................................................................................. + str q27, [x1, #576] // .................................................................................................................................................................................................................*...................................................................... + cmge v27.4S, v31.4S, v19.4S // ........................................................................................................................................................................................*............................................................................................... + mul v16.4S, v22.4S, v0.S[0] // ..........................................................................................................................................*............................................................................................................................................. + sqrdmulh v22.4S, v22.4S, v0.S[1] // ...........................................................................................................................................*............................................................................................................................................ 
+ sub v20.4S, v27.4S, v20.4S // ..........................................................................................................................................................................................*............................................................................................. + mls v16.4S, v22.4S, v29.4S // ............................................................................................................................................*........................................................................................................................................... + cmge v22.4S, v11.4S, v30.4S // .................................................................................................................................................................................................*...................................................................................... + mls v19.4S, v20.4S, v29.4S // ...........................................................................................................................................................................................*............................................................................................ + add v27.4S, v14.4S, v23.4S // .......................................................................................................................................................................*................................................................................................................ + sub v20.4S, v14.4S, v23.4S // ......................................................................................................................................................................*................................................................................................................. 
+ sub v23.4S, v24.4S, v22.4S // ..................................................................................................................................................................................................*..................................................................................... + cmge v14.4S, v16.4S, v30.4S // .................................................................................................................................................................................*...................................................................................................... + cmge v24.4S, v31.4S, v16.4S // ................................................................................................................................................................................*....................................................................................................... + str q19, [x1, #640] // ..................................................................................................................................................................................................................*..................................................................... + sub v22.4S, v21.4S, v17.4S // .......................................................................................................................................................*................................................................................................................................ + mls v11.4S, v23.4S, v29.4S // ...................................................................................................................................................................................................*.................................................................................... 
+ sub v24.4S, v24.4S, v14.4S // ..................................................................................................................................................................................*..................................................................................................... + sqrdmulh v14.4S, v20.4S, v0.S[1] // .........................................................................................................................................................................*.............................................................................................................. + mul v19.4S, v22.4S, v0.S[0] // .........................................................................................................................................................*.............................................................................................................................. + sqrdmulh v23.4S, v22.4S, v0.S[1] // ..........................................................................................................................................................*............................................................................................................................. + str q11, [x1, #768] // ....................................................................................................................................................................................................................*................................................................... + add v11.4S, v21.4S, v17.4S // ........................................................................................................................................................*............................................................................................................................... 
+ mls v16.4S, v24.4S, v29.4S // ...................................................................................................................................................................................*.................................................................................................... + mul v21.4S, v28.4S, v0.S[0] // ...................................................................................................................................................................*.................................................................................................................... + mul v24.4S, v20.4S, v0.S[0] // ........................................................................................................................................................................*............................................................................................................... + mul v20.4S, v12.4S, v25.4S // ....................................................................................................................................................................................................................................*................................................... + mls v19.4S, v23.4S, v29.4S // ...........................................................................................................................................................*............................................................................................................................ + str q16, [x1, #512] // ................................................................................................................................................................................................................*....................................................................... 
+ mul v23.4S, v18.4S, v0.S[0] // .............................................................................................................................................................................*.......................................................................................................... + sqrdmulh v22.4S, v18.4S, v0.S[1] // ..............................................................................................................................................................................*......................................................................................................... + sqrdmulh v18.4S, v12.4S, v26.4S // .....................................................................................................................................................................................................................................*.................................................. + mls v24.4S, v14.4S, v29.4S // ..........................................................................................................................................................................*............................................................................................................. + cmge v17.4S, v31.4S, v19.4S // ............................................................................................................................................................................................*........................................................................................... + sqrdmulh v16.4S, v28.4S, v0.S[1] // ....................................................................................................................................................................*................................................................................................................... 
+ cmge v28.4S, v19.4S, v30.4S // .............................................................................................................................................................................................*.......................................................................................... + cmge v14.4S, v24.4S, v30.4S // .........................................................................................................................................................................................................*.............................................................................. + cmge v12.4S, v31.4S, v24.4S // ........................................................................................................................................................................................................*............................................................................... + sub v28.4S, v17.4S, v28.4S // ..............................................................................................................................................................................................*......................................................................................... + mls v23.4S, v22.4S, v29.4S // ...............................................................................................................................................................................*........................................................................................................ + sub v17.4S, v12.4S, v14.4S // ..........................................................................................................................................................................................................*............................................................................. 
+ mls v21.4S, v16.4S, v29.4S // .....................................................................................................................................................................*.................................................................................................................. + mls v19.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................*........................................................................................ + sqrdmulh v14.4S, v27.4S, v26.4S // ...........................................................................................................................................................................................................................................*............................................ + cmge v28.4S, v31.4S, v23.4S // ............................................................................................................................................................................................................*........................................................................... + cmge v12.4S, v23.4S, v30.4S // .............................................................................................................................................................................................................*.......................................................................... + cmge v16.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................*.................................................................................. 
+ cmge v22.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................*................................................................................... + mls v24.4S, v17.4S, v29.4S // ...........................................................................................................................................................................................................*............................................................................ + mul v17.4S, v9.4S, v25.4S // ...........................................................................................................................................................................................................................*............................................................ + str q19, [x1, #704] // ...................................................................................................................................................................................................................*.................................................................... + sub v28.4S, v28.4S, v12.4S // ..............................................................................................................................................................................................................*......................................................................... + sub v12.4S, v22.4S, v16.4S // ......................................................................................................................................................................................................*................................................................................. 
+ sqrdmulh v16.4S, v9.4S, v26.4S // ............................................................................................................................................................................................................................*........................................................... + mul v22.4S, v27.4S, v25.4S // ..........................................................................................................................................................................................................................................*............................................. + mls v21.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................*................................................................................ + str q24, [x1, #896] // ......................................................................................................................................................................................................................*................................................................. + mls v23.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................*........................................................................ + mul v19.4S, v11.4S, v25.4S // .................................................................................................................................................................................................................................*...................................................... 
+ sqrdmulh v11.4S, v11.4S, v26.4S // ..................................................................................................................................................................................................................................*..................................................... + mls v22.4S, v14.4S, v29.4S // ............................................................................................................................................................................................................................................*........................................... + mls v17.4S, v16.4S, v29.4S // .............................................................................................................................................................................................................................*.......................................................... + str q21, [x1, #832] // .....................................................................................................................................................................................................................*.................................................................. + cmge v12.4S, v17.4S, v30.4S // .....................................................................................................................................................................................................................................................*.................................. + cmge v27.4S, v31.4S, v17.4S // ....................................................................................................................................................................................................................................................*................................... 
+ cmge v9.4S, v31.4S, v22.4S // ........................................................................................................................................................................................................................................................................*............... + cmge v24.4S, v22.4S, v30.4S // .........................................................................................................................................................................................................................................................................*.............. + str q23, [x1, #960] // .......................................................................................................................................................................................................................*................................................................ + sub v12.4S, v27.4S, v12.4S // ......................................................................................................................................................................................................................................................*................................. + sqrdmulh v27.4S, v8.4S, v26.4S // .........................................................................................................................................................................................................................*.............................................................. + sub v24.4S, v9.4S, v24.4S // ..........................................................................................................................................................................................................................................................................*............. 
+ mul v16.4S, v8.4S, v25.4S // ........................................................................................................................................................................................................................*............................................................... + mls v17.4S, v12.4S, v29.4S // .......................................................................................................................................................................................................................................................*................................ + mls v19.4S, v11.4S, v29.4S // ...................................................................................................................................................................................................................................*.................................................... + mul v8.4S, v15.4S, v25.4S // .............................................................................................................................................................................................................................................*.......................................... + sqrdmulh v9.4S, v15.4S, v26.4S // ..............................................................................................................................................................................................................................................*......................................... + mls v16.4S, v27.4S, v29.4S // ..........................................................................................................................................................................................................................*............................................................. 
+ mul v21.4S, v13.4S, v25.4S // .......................................................................................................................................................................................................................................*................................................ + str q17, [x1, #64] // .................................................................................................................................................................................................................................................................................*...... + sqrdmulh v13.4S, v13.4S, v26.4S // ........................................................................................................................................................................................................................................*............................................... + mul v11.4S, v10.4S, v25.4S // ..............................................................................................................................................................................................................................*......................................................... + mls v20.4S, v18.4S, v29.4S // ......................................................................................................................................................................................................................................*................................................. + mls v8.4S, v9.4S, v29.4S // ...............................................................................................................................................................................................................................................*........................................ 
+ cmge v9.4S, v16.4S, v30.4S // .................................................................................................................................................................................................................................................*...................................... + cmge v12.4S, v31.4S, v16.4S // ................................................................................................................................................................................................................................................*....................................... + mls v21.4S, v13.4S, v29.4S // .........................................................................................................................................................................................................................................*.............................................. + sqrdmulh v13.4S, v10.4S, v26.4S // ...............................................................................................................................................................................................................................*........................................................ + sub v9.4S, v12.4S, v9.4S // ..................................................................................................................................................................................................................................................*..................................... + cmge v12.4S, v31.4S, v8.4S // ............................................................................................................................................................................................................................................................................*........... 
+ cmge v18.4S, v19.4S, v30.4S // .............................................................................................................................................................................................................................................................*.......................... + mls v11.4S, v13.4S, v29.4S // ................................................................................................................................................................................................................................*....................................................... + mls v16.4S, v9.4S, v29.4S // ...................................................................................................................................................................................................................................................*.................................... + cmge v23.4S, v8.4S, v30.4S // .............................................................................................................................................................................................................................................................................*.......... + cmge v14.4S, v31.4S, v20.4S // ................................................................................................................................................................................................................................................................*....................... + cmge v9.4S, v20.4S, v30.4S // .................................................................................................................................................................................................................................................................*...................... 
+ mls v22.4S, v24.4S, v29.4S // ...........................................................................................................................................................................................................................................................................*............ + sub v28.4S, v12.4S, v23.4S // ..............................................................................................................................................................................................................................................................................*......... + str q16, [x1], #(16) // ................................................................................................................................................................................................................................................................................*....... + sub v15.4S, v14.4S, v9.4S // ..................................................................................................................................................................................................................................................................*..................... + cmge v12.4S, v31.4S, v21.4S // ....................................................................................................................................................................................................................................................................*................... + mls v8.4S, v28.4S, v29.4S // ...............................................................................................................................................................................................................................................................................*........ 
+ cmge v14.4S, v21.4S, v30.4S // .....................................................................................................................................................................................................................................................................*.................. + str q22, [x1, #368] // ......................................................................................................................................................................................................................................................................................*. + cmge v23.4S, v31.4S, v19.4S // ............................................................................................................................................................................................................................................................*........................... + cmge v28.4S, v11.4S, v30.4S // .........................................................................................................................................................................................................................................................*.............................. + sub v24.4S, v12.4S, v14.4S // ......................................................................................................................................................................................................................................................................*................. + cmge v9.4S, v31.4S, v11.4S // ........................................................................................................................................................................................................................................................*............................... 
+ mls v20.4S, v15.4S, v29.4S // ...................................................................................................................................................................................................................................................................*.................... + sub v13.4S, v23.4S, v18.4S // ..............................................................................................................................................................................................................................................................*......................... + str q8, [x1, #432] // .......................................................................................................................................................................................................................................................................................* + mls v21.4S, v24.4S, v29.4S // .......................................................................................................................................................................................................................................................................*................ + sub v22.4S, v9.4S, v28.4S // ..........................................................................................................................................................................................................................................................*............................. + mls v19.4S, v13.4S, v29.4S // ...............................................................................................................................................................................................................................................................*........................ 
+ mls v11.4S, v22.4S, v29.4S // ...........................................................................................................................................................................................................................................................*............................ + str q20, [x1, #240] // ....................................................................................................................................................................................................................................................................................*... + str q21, [x1, #304] // .....................................................................................................................................................................................................................................................................................*.. + str q19, [x1, #176] // ...................................................................................................................................................................................................................................................................................*.... + str q11, [x1, #112] // ..................................................................................................................................................................................................................................................................................*..... 
+ + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s new file mode 100644 index 00000000..12c85522 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a55.s @@ -0,0 +1,2327 @@ + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +xtmp0 .req x10 +xtmp1 .req x11 +/* ldr_vo loads the Q-register alias qform_vec from base plus an immediate offset. */ +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm +/* ldr_vi loads qform_vec from base and then post-increments base by inc. */ +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm +/* str_vo and str_vi are the store counterparts. The v-prefixed macros that follow wrap one NEON instruction each and append the .4s arrangement; the q-suffixed ones multiply by 32-bit lane i of b. */ +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm +/* mulmodq computes dst = src * const[idx0] - sqrdmulh(src, const[idx1]) * consts[0]; src is overwritten. */ +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm +/* mulmod is the same sequence with full-vector multipliers const and const_twisted; src is overwritten. */ +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm +/* barrett_reduce_single computes a -= srshr(a, 23) * consts lane by lane; clobbers tmp. */ +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, consts +.endm
+/* canonical_reduce builds lane masks for a <= neg_modulus_half and a >= modulus_half with cmge, then subtracts (mask difference) * consts from a: lanes at or below the lower bound gain consts, lanes at or above the upper bound lose consts. Clobbers tmp1 and tmp2. */ +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, consts +.endm +/* gs_butterfly replaces a with a + b and b with mulmodq of (a - b) by root lanes idx0 and idx1; clobbers tmp. */ +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm +/* gs_butterfly_v is the same butterfly using mulmod with full-vector root and root_twisted; clobbers tmp. */ +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm +/* mul_ninv applies mulmod with the register aliases ninv and ninv_tw to four sources; each source is overwritten by mulmod. */ +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm +/* load_vectors loads four consecutive 16-byte vectors starting at addr. */ +.macro load_vectors a0, a1, a2, a3, addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm +/* load_vectors_with_offset does the same starting at addr + offset. */ +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm +/* store_vectors_with_inc stores a0 with a post-increment of addr by inc, then a1..a3 at offsets that cancel the increment, so the four vectors land contiguously at the old addr. */ +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm +/* vec_to_scalar_matrix moves both 64-bit lanes of in0..in3 into the eight aliases out_00..out_31 via vext. */ +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm +/* store_scalar_matrix_with_inc stores the eight aliases named by prefix x plus t_00..t_31 into eight consecutive 8-byte slots at the old addr, post-incrementing addr by inc. */ +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30,
[\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm +/* vext copies 64-bit lane number lane of vec_in into gpr_out with umov. */ +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm +/* load_roots_123 loads root0..root3 from r_ptr0 and advances r_ptr0 by 64 bytes. */ +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm +/* load_roots_456 is the same instruction sequence as load_roots_123. */ +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm +/* load_roots_78_part1 loads three root and root_tw pairs from r_ptr1 and advances r_ptr1 by 12*16 bytes. */ +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm +/* load_roots_78_part2 loads the next three pairs at negative offsets from r_ptr1; the offsets presume r_ptr1 was already advanced by part1. */ +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm +/* transpose4 transposes the 4x4 matrix of 32-bit lanes held in data0..data3 in place; clobbers t0..t3. */ +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm +/* transpose_single performs only the first (32-bit trn1 and trn2) stage of that transpose, writing to separate output registers. */ +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm +/* save_gprs reserves 6*16 bytes of stack and stores x19..x30 there. NOTE(review): the x19/x20 store is issued twice in a row; the repeat is redundant but harmless. */ +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29,
x30, [sp, #16*5] +.endm +/* restore_gprs reloads x19..x30 and releases the 6*16 bytes reserved by save_gprs. */ +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) +.endm +/* save_vregs reserves 4*16 bytes of stack and stores d8..d15 there. */ +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm +/* restore_vregs reloads d8..d15 and releases the 4*16 bytes reserved by save_vregs. */ +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm +/* STACK_SIZE is the scratch area reserved by push_stack; STACK0 is the offset of its single slot as used with save and restore. */ +#define STACK_SIZE 16 +#define STACK0 0 +/* restore loads a from sp + loc and save stores a to sp + loc (note the swapped argument order); push_stack saves GPRs and vector registers, then reserves the scratch area. */ +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm +/* pop_stack undoes push_stack in reverse order. */ +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4_opt_a55 + .global _intt_dilithium_123_45678_manual_ld4_opt_a55 +/* const_addr holds 8380417 (presumably the Dilithium modulus q; confirm) padded with zero words to 16 bytes; ninv_addr and ninv_tw_addr hold 64-bit constants. */ +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_manual_ld4_opt_a55: +_intt_dilithium_123_45678_manual_ld4_opt_a55: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 +
qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q3, [x4, #48] // ........................* + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q21, [x5, #16] // .*....................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q28, [x5, #32] // ..*...................... + // gap // ......................... 
+ // gap // ......................... + // gap // ......................... + ldr q30, [x1, #0] // ...*..................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q12, [x2, #16] // .........*............... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q17, [x2, #32] // ..........*.............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q7, [x1, #32] // .....*................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q2, [x5, #96] // ...............*......... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q14, [x5, #144] // ..................*...... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q5, [x5, #160] // ...................*..... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q29, [x5, #176] // ....................*.... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q23, [x4], #64 // .....................*... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q31, [x4, #-32] // .......................*. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q25, [x5, #128] // .................*....... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q1, [x1, #48] // ......*.................. + // gap // ......................... 
+ // gap // ......................... + // gap // ......................... + ldr q11, [x5, #64] // .............*........... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q9, [x4, #-48] // ......................*.. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q19, [x5, #80] // ..............*.......... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q24, [x1, #16] // ....*.................... + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q15, [x5], #(12*16) // *........................ + // gap // ......................... + // gap // ......................... + // gap // ......................... + trn1 v6.4S, v7.4S, v1.4S // ........*................ + // gap // ......................... + ldr q18, [x5, #-144] // ............*............ + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q13, [x2, #0] // .......*................. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q27, [x2, #48] // ...........*............. + // gap // ......................... + // gap // ......................... + // gap // ......................... + ldr q4, [x5, #-80] // ................*........ + // gap // ......................... + + // original source code + // ldr q15, [x5], #(12*16) // ...................*..... + // ldr q21, [x5, #-176] // .*....................... + // ldr q28, [x5, #-160] // ..*...................... + // ldr q30, [x1, #0] // ...*..................... + // ldr q24, [x1, #16] // ..................*...... + // ldr q7, [x1, #32] // ......*.................. + // ldr q1, [x1, #48] // ..............*.......... 
+ // ldr q13, [x2, #0] // ......................*.. + // trn1 v6.4S, v7.4S, v1.4S // ....................*.... + // ldr q12, [x2, #16] // ....*.................... + // ldr q17, [x2, #32] // .....*................... + // ldr q27, [x2, #48] // .......................*. + // ldr q18, [x5, #-144] // .....................*... + // ldr q11, [x5, #-128] // ...............*......... + // ldr q19, [x5, #-112] // .................*....... + // ldr q2, [x5, #-96] // .......*................. + // ldr q4, [x5, #-80] // ........................* + // ldr q25, [x5, #-64] // .............*........... + // ldr q14, [x5, #-48] // ........*................ + // ldr q5, [x5, #-32] // .........*............... + // ldr q29, [x5, #-16] // ..........*.............. + // ldr q23, [x4], #64 // ...........*............. + // ldr q9, [x4, #-48] // ................*........ + // ldr q31, [x4, #-32] // ............*............ + // ldr q3, [x4, #-16] // *........................ + + sub count, count, #1 +layer45678_start: + trn1 v0.4S, v30.4S, v24.4S // ....*......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v10.4S, v30.4S, v24.4S // .....*........................................................................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ trn2 v30.4S, v7.4S, v1.4S // .......*...................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v24.2D, v0.2D, v6.2D // ........*..................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v0.2D, v0.2D, v6.2D // ..........*................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v7.2D, v10.2D, v30.2D // .........*.................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v10.2D, v10.2D, v30.2D // ...........*.................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sub v30.4S, v24.4S, v7.4S // ...................................*.......................................................................................................................................... + // gap // .............................................................................................................................................................................. + add v24.4S, v24.4S, v7.4S // ....................................*......................................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v7.4S, v0.4S, v10.4S // ..............................*............................................................................................................................................... + // gap // .............................................................................................................................................................................. + add v0.4S, v0.4S, v10.4S // ...............................*.............................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v10.4S, v13.4S, v12.4S // ................*............................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn2 v1.4S, v13.4S, v12.4S // .................*............................................................................................................................................................ + // gap // .............................................................................................................................................................................. + trn1 v13.4S, v17.4S, v27.4S // ..................*........................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v6.4S, v17.4S, v27.4S // ...................*.......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v12.4S, v30.4S, v11.4S // .....................................*........................................................................................................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v30.4S, v30.4S, v19.4S // ......................................*....................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ sub v17.4S, v0.4S, v24.4S // ........................................*..................................................................................................................................... + // gap // .............................................................................................................................................................................. + add v0.4S, v0.4S, v24.4S // .........................................*.................................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v24.4S, v7.4S, v28.4S // ................................*............................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v7.4S, v7.4S, v18.4S // .................................*............................................................................................................................................ + // gap // .............................................................................................................................................................................. + trn2 v28.2D, v10.2D, v13.2D // ....................*......................................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ trn2 v27.2D, v1.2D, v6.2D // .....................*........................................................................................................................................................ + // gap // .............................................................................................................................................................................. + trn1 v10.2D, v10.2D, v13.2D // ......................*....................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v1.2D, v1.2D, v6.2D // .......................*...................................................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v24.4S, v7.4S, v8.S[0] // ..................................*........................................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v12.4S, v30.4S, v8.S[0] // .......................................*...................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ mul v30.4S, v17.4S, v15.4S // ..........................................*................................................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v7.4S, v17.4S, v21.4S // ...........................................*.................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v10.4S, v1.4S // ........................................................*..................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v6.4S, v24.4S, v12.4S // .............................................*................................................................................................................................ + // gap // .............................................................................................................................................................................. + add v24.4S, v24.4S, v12.4S // ..............................................*............................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ mls v30.4S, v7.4S, v8.S[0] // ............................................*................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v7.4S, v6.4S, v15.4S // ...............................................*.............................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v6.4S, v6.4S, v21.4S // ................................................*............................................................................................................................. + // gap // .............................................................................................................................................................................. + add v10.4S, v10.4S, v1.4S // .........................................................*.................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v1.4S, v13.4S, v25.4S // ..........................................................*................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ trn1 v12.4S, v0.4S, v24.4S // ............................................................................*................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v0.4S, v0.4S, v24.4S // .............................................................................*................................................................................................ + // gap // .............................................................................................................................................................................. + mls v7.4S, v6.4S, v8.S[0] // .................................................*............................................................................................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v24.4S, v13.4S, v14.4S // ...........................................................*.................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v28.4S, v27.4S // .............................................................*................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ add v6.4S, v28.4S, v27.4S // ..............................................................*............................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v17.4S, v30.4S, v7.4S // ..............................................................................*............................................................................................... + // gap // .............................................................................................................................................................................. + mls v1.4S, v24.4S, v8.S[0] // ............................................................*................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v24.4S, v13.4S, v5.4S // ...............................................................*.............................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v29.4S // ................................................................*............................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sub v28.4S, v10.4S, v6.4S // ..................................................................*........................................................................................................... + // gap // .............................................................................................................................................................................. + add v10.4S, v10.4S, v6.4S // ...................................................................*.......................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v30.4S, v30.4S, v7.4S // ...............................................................................*.............................................................................................. + // gap // .............................................................................................................................................................................. + mls v24.4S, v13.4S, v8.S[0] // .................................................................*............................................................................................................ + // gap // .............................................................................................................................................................................. + mul v7.4S, v28.4S, v2.4S // ....................................................................*......................................................................................................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v13.4S, v28.4S, v4.4S // .....................................................................*........................................................................................................ + // gap // .............................................................................................................................................................................. + trn2 v6.2D, v12.2D, v17.2D // ................................................................................*............................................................................................. + // gap // .............................................................................................................................................................................. + sub v28.4S, v1.4S, v24.4S // .......................................................................*...................................................................................................... + // gap // .............................................................................................................................................................................. + add v24.4S, v1.4S, v24.4S // ........................................................................*..................................................................................................... + // gap // .............................................................................................................................................................................. + mls v7.4S, v13.4S, v8.S[0] // ......................................................................*....................................................................................................... + // gap // .............................................................................................................................................................................. 
+ mul v1.4S, v28.4S, v2.4S // .........................................................................*.................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v28.4S, v4.4S // ..........................................................................*................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v28.2D, v0.2D, v30.2D // .................................................................................*............................................................................................ + // gap // .............................................................................................................................................................................. + trn1 v12.2D, v12.2D, v17.2D // ..................................................................................*........................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v0.2D, v0.2D, v30.2D // ...................................................................................*.......................................................................................... + // gap // .............................................................................................................................................................................. 
+ mls v1.4S, v13.4S, v8.S[0] // ...........................................................................*.................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v30.4S, v10.4S, v24.4S // ....................................................................................*......................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v10.4S, v10.4S, v24.4S // .....................................................................................*........................................................................................ + // gap // .............................................................................................................................................................................. + sub v24.4S, v12.4S, v0.4S // ................................................................................................*............................................................................. + // gap // .............................................................................................................................................................................. + trn1 v13.4S, v7.4S, v1.4S // ......................................................................................*....................................................................................... + // gap // .............................................................................................................................................................................. 
+ trn2 v7.4S, v7.4S, v1.4S // .......................................................................................*...................................................................................... + // gap // .............................................................................................................................................................................. + add v0.4S, v12.4S, v0.4S // .................................................................................................*............................................................................ + // gap // .............................................................................................................................................................................. + trn2 v1.2D, v30.2D, v13.2D // ........................................................................................*..................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v12.2D, v10.2D, v7.2D // .........................................................................................*.................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v30.2D, v30.2D, v13.2D // ..........................................................................................*................................................................................... + // gap // .............................................................................................................................................................................. 
+ trn1 v10.2D, v10.2D, v7.2D // ...........................................................................................*.................................................................................. + // gap // .............................................................................................................................................................................. + mul v7.4S, v24.4S, v9.S[2] // ..................................................................................................*........................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v24.4S, v24.4S, v9.S[3] // ...................................................................................................*.......................................................................... + // gap // .............................................................................................................................................................................. + sub v13.4S, v6.4S, v28.4S // .....................................................................................................*........................................................................ + // gap // .............................................................................................................................................................................. + add v6.4S, v6.4S, v28.4S // ......................................................................................................*....................................................................... + // gap // .............................................................................................................................................................................. 
+ sub v17.4S, v30.4S, v10.4S // ..........................................................................................................*................................................................... + // gap // .............................................................................................................................................................................. + mls v7.4S, v24.4S, v8.S[0] // ....................................................................................................*......................................................................... + // gap // .............................................................................................................................................................................. + mul v24.4S, v13.4S, v31.S[0] // .......................................................................................................*...................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v31.S[1] // ........................................................................................................*..................................................................... + // gap // .............................................................................................................................................................................. + add v10.4S, v30.4S, v10.4S // ...........................................................................................................*.................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v30.4S, v17.4S, v31.S[2] // ............................................................................................................*................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v17.4S, v31.S[3] // .............................................................................................................*................................................................ + // gap // .............................................................................................................................................................................. + mls v24.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... + // gap // .............................................................................................................................................................................. + sub v13.4S, v1.4S, v12.4S // ...............................................................................................................*.............................................................. + // gap // .............................................................................................................................................................................. + add v1.4S, v1.4S, v12.4S // ................................................................................................................*............................................................. + // gap // .............................................................................................................................................................................. 
+ mls v30.4S, v17.4S, v8.S[0] // ..............................................................................................................*............................................................... + // gap // .............................................................................................................................................................................. + mul v12.4S, v13.4S, v3.S[0] // .................................................................................................................*............................................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v3.S[1] // ..................................................................................................................*........................................................... + // gap // .............................................................................................................................................................................. + sub v17.4S, v0.4S, v6.4S // ....................................................................................................................*......................................................... + // gap // .............................................................................................................................................................................. + add v0.4S, v0.4S, v6.4S // .....................................................................................................................*........................................................ + // gap // .............................................................................................................................................................................. 
+ sub v6.4S, v7.4S, v24.4S // .........................................................................................................................*.................................................... + // gap // .............................................................................................................................................................................. + mls v12.4S, v13.4S, v8.S[0] // ...................................................................................................................*.......................................................... + // gap // .............................................................................................................................................................................. + mul v13.4S, v17.4S, v23.S[2] // ......................................................................................................................*....................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v17.4S, v23.S[3] // .......................................................................................................................*...................................................... + // gap // .............................................................................................................................................................................. + add v24.4S, v7.4S, v24.4S // ..........................................................................................................................*................................................... + // gap // .............................................................................................................................................................................. 
+ mul v7.4S, v6.4S, v23.S[2] // ...........................................................................................................................*.................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v6.4S, v6.4S, v23.S[3] // ............................................................................................................................*................................................. + // gap // .............................................................................................................................................................................. + mls v13.4S, v17.4S, v8.S[0] // ........................................................................................................................*..................................................... + // gap // .............................................................................................................................................................................. + sub v17.4S, v10.4S, v1.4S // ..............................................................................................................................*............................................... + // gap // .............................................................................................................................................................................. + add v10.4S, v10.4S, v1.4S // ...............................................................................................................................*.............................................. + // gap // .............................................................................................................................................................................. 
+ mls v7.4S, v6.4S, v8.S[0] // .............................................................................................................................*................................................ + // gap // .............................................................................................................................................................................. + mul v1.4S, v17.4S, v9.S[0] // ................................................................................................................................*............................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v6.4S, v17.4S, v9.S[1] // .................................................................................................................................*............................................ + // gap // .............................................................................................................................................................................. + sub v17.4S, v30.4S, v12.4S // ...................................................................................................................................*.......................................... + // gap // .............................................................................................................................................................................. + add v30.4S, v30.4S, v12.4S // ....................................................................................................................................*......................................... + // gap // .............................................................................................................................................................................. 
+ srshr v12.4S, v0.4S, #23 // ........................................................................................................................................*..................................... + // gap // .............................................................................................................................................................................. + mls v1.4S, v6.4S, v8.S[0] // ..................................................................................................................................*........................................... + // gap // .............................................................................................................................................................................. + mul v6.4S, v17.4S, v9.S[0] // .....................................................................................................................................*........................................ + // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v17.4S, v9.S[1] // ......................................................................................................................................*....................................... + // gap // .............................................................................................................................................................................. + mls v0.4S, v12.4S, v8.4S // .........................................................................................................................................*.................................... + // gap // .............................................................................................................................................................................. 
+ srshr v12.4S, v24.4S, #23 // ..........................................................................................................................................*................................... + // gap // .............................................................................................................................................................................. + srshr v28.4S, v10.4S, #23 // ............................................................................................................................................*................................. + // gap // .............................................................................................................................................................................. + mls v6.4S, v17.4S, v8.S[0] // .......................................................................................................................................*...................................... + // gap // .............................................................................................................................................................................. + mls v24.4S, v12.4S, v8.4S // ...........................................................................................................................................*.................................. + // gap // .............................................................................................................................................................................. + mls v10.4S, v28.4S, v8.4S // .............................................................................................................................................*................................ + // gap // .............................................................................................................................................................................. 
+ srshr v12.4S, v30.4S, #23 // ..............................................................................................................................................*............................... + // gap // .............................................................................................................................................................................. + sub v17.4S, v13.4S, v1.4S // ..........................................................................................................................................................*................... + // gap // .............................................................................................................................................................................. + add v1.4S, v13.4S, v1.4S // ...........................................................................................................................................................*.................. + // gap // .............................................................................................................................................................................. + mls v30.4S, v12.4S, v8.4S // ...............................................................................................................................................*.............................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v0.4S, v10.4S // ................................................................................................................................................*............................. + // gap // .............................................................................................................................................................................. 
+ add v0.4S, v0.4S, v10.4S // .................................................................................................................................................*............................ + // gap // .............................................................................................................................................................................. + mul v10.4S, v17.4S, v23.S[0] // ............................................................................................................................................................*................. + // gap // .............................................................................................................................................................................. + mul v12.4S, v13.4S, v23.S[0] // ..................................................................................................................................................*........................... + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v13.4S, v23.S[1] // ...................................................................................................................................................*.......................... + // gap // .............................................................................................................................................................................. + sub v28.4S, v24.4S, v30.4S // .....................................................................................................................................................*........................ + // gap // .............................................................................................................................................................................. 
+ add v30.4S, v24.4S, v30.4S // ......................................................................................................................................................*....................... + // gap // .............................................................................................................................................................................. + sqrdmulh v24.4S, v17.4S, v23.S[1] // .............................................................................................................................................................*................ + // gap // .............................................................................................................................................................................. + mls v12.4S, v13.4S, v8.S[0] // ....................................................................................................................................................*......................... + // gap // .............................................................................................................................................................................. + mul v13.4S, v28.4S, v23.S[0] // .......................................................................................................................................................*...................... + // gap // .............................................................................................................................................................................. + sqrdmulh v17.4S, v28.4S, v23.S[1] // ........................................................................................................................................................*..................... + // gap // .............................................................................................................................................................................. 
+ mls v10.4S, v24.4S, v8.S[0] // ..............................................................................................................................................................*............... + // gap // .............................................................................................................................................................................. + sub v24.4S, v7.4S, v6.4S // ...............................................................................................................................................................*.............. + // gap // .............................................................................................................................................................................. + add v7.4S, v7.4S, v6.4S // ................................................................................................................................................................*............. + // gap // .............................................................................................................................................................................. + mls v13.4S, v17.4S, v8.S[0] // .........................................................................................................................................................*.................... + // gap // .............................................................................................................................................................................. + mul v6.4S, v24.4S, v23.S[0] // .................................................................................................................................................................*............ + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v24.4S, v24.4S, v23.S[1] // ..................................................................................................................................................................*........... + // gap // .............................................................................................................................................................................. + str q0, [x1], #(16*4) // ....................................................................................................................................................................*......... + // gap // .............................................................................................................................................................................. + ldr q15, [x5], #(12*16) // ........................e..................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v6.4S, v24.4S, v8.S[0] // ...................................................................................................................................................................*.......... + // gap // .............................................................................................................................................................................. 
+ str q30, [x1, #-48] // .....................................................................................................................................................................*........ + // gap // .............................................................................................................................................................................. + ldr q21, [x5, #-176] // .........................e.................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q1, [x1, #-32] // ......................................................................................................................................................................*....... + // gap // .............................................................................................................................................................................. + ldr q28, [x5, #-160] // ..........................e................................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q7, [x1, #-16] // .......................................................................................................................................................................*...... + add x1, x1, #64 // ............................................................................................................................................................................*. + ldr q30, [x1, #0] // e............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q12, [x2], #(16*4) // ........................................................................................................................................................................*..... + // gap // .............................................................................................................................................................................. 
+ ldr q24, [x1, #16] // .e............................................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q13, [x2, #-48] // .........................................................................................................................................................................*.... + // gap // .............................................................................................................................................................................. + ldr q7, [x1, #32] // ..e........................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ str q10, [x2, #-32] // ..........................................................................................................................................................................*... + // gap // .............................................................................................................................................................................. + ldr q1, [x1, #48] // ...e.......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q6, [x2, #-16] // ...........................................................................................................................................................................*.. + add x2, x2, #64 // .............................................................................................................................................................................* + ldr q13, [x2, #0] // ............e................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v6.4S, v7.4S, v1.4S // ......e....................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q12, [x2, #16] // .............e................................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q17, [x2, #32] // ..............e............................................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q27, [x2, #48] // ...............e.............................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q18, [x5, #-144] // ...........................e.................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ ldr q11, [x5, #-128] // ............................e................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q19, [x5, #-112] // .............................e................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q2, [x5, #-96] // ..................................................e........................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q4, [x5, #-80] // ...................................................e.......................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q25, [x5, #-64] // ....................................................e......................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ ldr q14, [x5, #-48] // .....................................................e........................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q5, [x5, #-32] // ......................................................e....................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q29, [x5, #-16] // .......................................................e...................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q23, [x4], #64 // ............................................................................................e................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q9, [x4, #-48] // .............................................................................................e................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ ldr q31, [x4, #-32] // ..............................................................................................e............................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q3, [x4, #-16] // ...............................................................................................e.............................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + + // original source code + // ldr q9, [x1, #0] // ........e..........................|..................................................................................................................................................e........ + // ldr q10, [x1, #16] // ..........e........................|....................................................................................................................................................e...... 
+ // ldr q11, [x1, #32] // ............e......................|......................................................................................................................................................e.... + // ldr q12, [x1, #48] // ..............e....................|........................................................................................................................................................e.. + // trn1 v25.4s, v9.4s, v10.4s // ...................................*........................................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...................................|*.......................................................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ..................e................|........................................................................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // ...................................|.*......................................................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ...................................|..*........................................................................................................................................................ + // trn2 v12.2d, v26.2d, v28.2d // ...................................|....*...................................................................................................................................................... 
+ // trn1 v9.2d, v25.2d, v27.2d // ...................................|...*....................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...................................|.....*..................................................................................................................................................... + // ldr q13, [x2, #0] // .................e.................|........................................................................................................................................................... + // ldr q14, [x2, #16] // ...................e...............|........................................................................................................................................................... + // ldr q15, [x2, #32] // ....................e..............|........................................................................................................................................................... + // ldr q16, [x2, #48] // .....................e.............|........................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ...................................|..........*................................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ...................................|...........*............................................................................................................................................... 
+ // trn1 v27.4s, v15.4s, v16.4s // ...................................|............*.............................................................................................................................................. + // trn2 v28.4s, v15.4s, v16.4s // ...................................|.............*............................................................................................................................................. + // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................*...................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................*..................................................................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................*.................................................................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................*................................................................................................................................... + // ldr q0, [x5], #(12*16) // e..................................|..........................................................................................................................................e................ + // ldr q4, [x5, #(-12*16 + 1*16)] // ...e...............................|.............................................................................................................................................e............. 
+ // ldr q1, [x5, #(-12*16 + 2*16)] // .....e.............................|...............................................................................................................................................e........... + // ldr q5, [x5, #(-12*16 + 3*16)] // ......................e............|........................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 4*16)] // .......................e...........|........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ........................e..........|........................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...................................|........*.................................................................................................................................................. + // add v9.4s, v9.4s, v10.4s // ...................................|.........*................................................................................................................................................. + // mul v10.4s, v24.4s, v1.4s // ...................................|..................*........................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|...................*....................................................................................................................................... 
+ // mls v10.4s, v24.4s, v8.s[0] // ...................................|........................*.................................................................................................................................. + // sub v24.4s, v11.4s, v12.4s // ...................................|......*.................................................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................|.......*................................................................................................................................................... + // mul v12.4s, v24.4s, v2.4s // ...................................|..............*............................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|...............*........................................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|.........................*................................................................................................................................. + // sub v24.4s, v9.4s, v11.4s // ...................................|................*.......................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...................................|.................*......................................................................................................................................... 
+ // mul v11.4s, v24.4s, v0.4s // ...................................|..........................*................................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...........................*............................................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...................................|...............................*........................................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ...................................|.............................*............................................................................................................................. + // add v10.4s, v10.4s, v12.4s // ...................................|..............................*............................................................................................................................ + // mul v12.4s, v24.4s, v0.4s // ...................................|................................*.......................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.................................*......................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|......................................*.................................................................................................................... 
+ // ldr q0, [x5, #(-12*16 + 6*16)] // .........................e.........|........................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..........................e........|........................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e.......|........................................................................................................................................................... + // ldr q5, [x5, #(-12*16 + 9*16)] // ............................e......|........................................................................................................................................................... + // ldr q2, [x5, #(-12*16 + 10*16)] // .............................e.....|........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................e....|........................................................................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ...................................|............................*.............................................................................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................|..................................*........................................................................................................................ 
+ // mul v14.4s, v24.4s, v1.4s // ...................................|...................................*....................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // ...................................|.......................................*................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................................|...........................................*............................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................|........................................*.................................................................................................................. + // add v15.4s, v15.4s, v16.4s // ...................................|.........................................*................................................................................................................. + // mul v16.4s, v24.4s, v2.4s // ...................................|............................................*.............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ...................................|.............................................*............................................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................*......................................................................................................... 
+ // sub v24.4s, v13.4s, v15.4s // ...................................|..............................................*............................................................................................................ + // add v13.4s, v13.4s, v15.4s // ...................................|...............................................*........................................................................................................... + // mul v15.4s, v24.4s, v0.4s // ...................................|..................................................*........................................................................................................ + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|...................................................*....................................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|.......................................................*................................................................................................... + // sub v24.4s, v14.4s, v16.4s // ...................................|.....................................................*..................................................................................................... + // add v14.4s, v14.4s, v16.4s // ...................................|......................................................*.................................................................................................... + // mul v16.4s, v24.4s, v0.4s // ...................................|........................................................*.................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................|.........................................................*................................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.............................................................*............................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // ...................................|....................................*...................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ...................................|.....................................*..................................................................................................................... + // trn1 v27.4s, v11.4s, v12.4s // ...................................|..........................................*................................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // ...................................|................................................*.......................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ...................................|....................................................*...................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ...................................|..........................................................*................................................................................................ 
+ // trn1 v9.2d, v25.2d, v27.2d // ...................................|...........................................................*............................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ...................................|............................................................*.............................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ...................................|..............................................................*............................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ...................................|...............................................................*........................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...................................|.................................................................*......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ...................................|..................................................................*........................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ...................................|....................................................................*...................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................|.....................................................................*..................................................................................... 
+ // trn1 v13.2d, v25.2d, v27.2d // ...................................|......................................................................*.................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ...................................|.......................................................................*................................................................................... + // ldr q0, [x4], #64 // ...............................e...|........................................................................................................................................................... + // ldr q1, [x4, #(-64 + 16)] // ................................e..|........................................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // .................................e.|........................................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ..................................e|........................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ...................................|................................................................*.......................................................................................... + // add v9.4s, v9.4s, v10.4s // ...................................|...................................................................*....................................................................................... 
+ // mul v10.4s, v24.4s, v1.s[2] // ...................................|........................................................................*.................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...................................|.........................................................................*................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // ...................................|.............................................................................*............................................................................. + // sub v24.4s, v11.4s, v12.4s // ...................................|..........................................................................*................................................................................ + // add v11.4s, v11.4s, v12.4s // ...................................|...........................................................................*............................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ...................................|..............................................................................*............................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...................................|...............................................................................*........................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|...................................................................................*....................................................................... 
+ // sub v24.4s, v13.4s, v14.4s // ...................................|............................................................................*.............................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................|................................................................................*.......................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ...................................|.................................................................................*......................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................|..................................................................................*........................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................*.................................................................... + // sub v24.4s, v15.4s, v16.4s // ...................................|....................................................................................*...................................................................... + // add v15.4s, v15.4s, v16.4s // ...................................|.....................................................................................*..................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ...................................|.......................................................................................*................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................|........................................................................................*.................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|............................................................................................*.............................................................. + // sub v24.4s, v9.4s, v11.4s // ...................................|.........................................................................................*................................................................. + // add v9.4s, v9.4s, v11.4s // ...................................|..........................................................................................*................................................................ + // mul v11.4s, v24.4s, v0.s[2] // ...................................|.............................................................................................*............................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|..............................................................................................*............................................................ + // mls v11.4s, v24.4s, v8.s[0] // ...................................|..................................................................................................*........................................................ + // sub v24.4s, v10.4s, v12.4s // ...................................|...........................................................................................*............................................................... 
+ // add v10.4s, v10.4s, v12.4s // ...................................|...............................................................................................*........................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...................................|................................................................................................*.......................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................|.................................................................................................*......................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................|.....................................................................................................*..................................................... + // sub v24.4s, v13.4s, v15.4s // ...................................|...................................................................................................*....................................................... + // add v13.4s, v13.4s, v15.4s // ...................................|....................................................................................................*...................................................... + // mul v15.4s, v24.4s, v1.s[0] // ...................................|......................................................................................................*.................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.......................................................................................................*................................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // ...................................|...........................................................................................................*............................................... + // sub v24.4s, v14.4s, v16.4s // ...................................|........................................................................................................*.................................................. + // add v14.4s, v14.4s, v16.4s // ...................................|.........................................................................................................*................................................. + // mul v16.4s, v24.4s, v1.s[0] // ...................................|............................................................................................................*.............................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................|.............................................................................................................*............................................. + // mls v16.4s, v24.4s, v8.s[0] // ...................................|.................................................................................................................*......................................... + // srshr v24.4S, v9.4S, #23 // ...................................|..........................................................................................................*................................................ + // mls v9.4s, v24.4s, v8.4s // ...................................|..............................................................................................................*............................................ 
+ // srshr v24.4S, v10.4S, #23 // ...................................|...............................................................................................................*........................................... + // mls v10.4s, v24.4s, v8.4s // ...................................|..................................................................................................................*........................................ + // srshr v24.4S, v13.4S, #23 // ...................................|................................................................................................................*.......................................... + // mls v13.4s, v24.4s, v8.4s // ...................................|...................................................................................................................*....................................... + // srshr v24.4S, v14.4S, #23 // ...................................|....................................................................................................................*...................................... + // mls v14.4s, v24.4s, v8.4s // ...................................|.......................................................................................................................*................................... + // sub v24.4s, v9.4s, v13.4s // ...................................|........................................................................................................................*.................................. + // add v9.4s, v9.4s, v13.4s // ...................................|.........................................................................................................................*................................. 
+ // mul v13.4s, v24.4s, v0.s[0] // ...................................|...........................................................................................................................*............................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|............................................................................................................................*.............................. + // mls v13.4s, v24.4s, v8.s[0] // ...................................|................................................................................................................................*.......................... + // sub v24.4s, v10.4s, v14.4s // ...................................|.............................................................................................................................*............................. + // add v10.4s, v10.4s, v14.4s // ...................................|..............................................................................................................................*............................ + // mul v14.4s, v24.4s, v0.s[0] // ...................................|.................................................................................................................................*......................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|..................................................................................................................................*........................ + // mls v14.4s, v24.4s, v8.s[0] // ...................................|......................................................................................................................................*.................... 
+ // sub v24.4s, v11.4s, v15.4s // ...................................|.....................................................................................................................*..................................... + // add v11.4s, v11.4s, v15.4s // ...................................|......................................................................................................................*.................................... + // mul v15.4s, v24.4s, v0.s[0] // ...................................|..........................................................................................................................*................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|...............................................................................................................................*........................... + // mls v15.4s, v24.4s, v8.s[0] // ...................................|...................................................................................................................................*....................... + // sub v24.4s, v12.4s, v16.4s // ...................................|....................................................................................................................................*...................... + // add v12.4s, v12.4s, v16.4s // ...................................|.....................................................................................................................................*..................... + // mul v16.4s, v24.4s, v0.s[0] // ...................................|.......................................................................................................................................*................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................|........................................................................................................................................*.................. + // mls v16.4s, v24.4s, v8.s[0] // .*.................................|...........................................................................................................................................*............... + // str q9, [x1], #(16*4) // ...................................|.........................................................................................................................................*................. + // str q10, [x1, #(-16*4 + 1*16)] // ..*................................|............................................................................................................................................*.............. + // str q11, [x1, #(-16*4 + 2*16)] // ....*..............................|..............................................................................................................................................*............ + // str q12, [x1, #(-16*4 + 3*16)] // ......*............................|................................................................................................................................................*.......... + // str q13, [x2], #(16*4) // .........*.........................|...................................................................................................................................................*....... + // str q14, [x2, #(-16*4 + 1*16)] // ...........*.......................|.....................................................................................................................................................*..... 
+ // str q15, [x2, #(-16*4 + 2*16)] // .............*.....................|.......................................................................................................................................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ...............*...................|.........................................................................................................................................................*. + // add x1, x1, #64 // .......*...........................|.................................................................................................................................................*......... + // add x2, x2, #64 // ................*..................|..........................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer45678_start + trn1 v0.4S, v30.4S, v24.4S // *.................................................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v10.4S, v30.4S, v24.4S // .*................................................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v7.4S, v7.4S, v1.4S // ..*.................................................................................................................................................. + // gap // ..................................................................................................................................................... 
+ trn2 v20.2D, v0.2D, v6.2D // ...*................................................................................................................................................. + // gap // ..................................................................................................................................................... + trn1 v22.2D, v0.2D, v6.2D // ....*................................................................................................................................................ + // gap // ..................................................................................................................................................... + trn1 v24.2D, v10.2D, v7.2D // ......*.............................................................................................................................................. + // gap // ..................................................................................................................................................... + trn2 v10.2D, v10.2D, v7.2D // .....*............................................................................................................................................... + // gap // ..................................................................................................................................................... + sub v1.4S, v22.4S, v24.4S // .........*........................................................................................................................................... + // gap // ..................................................................................................................................................... + sub v30.4S, v20.4S, v10.4S // .......*............................................................................................................................................. 
+ // gap // ..................................................................................................................................................... + add v7.4S, v20.4S, v10.4S // ........*............................................................................................................................................ + // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v1.4S, v18.4S // ....................*................................................................................................................................ + // gap // ..................................................................................................................................................... + mul v26.4S, v30.4S, v11.4S // ...............*..................................................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v6.4S, v30.4S, v19.4S // ................*.................................................................................................................................... + // gap // ..................................................................................................................................................... + mul v18.4S, v1.4S, v28.4S // ...................*................................................................................................................................. + // gap // ..................................................................................................................................................... 
+ trn2 v16.4S, v13.4S, v12.4S // ............*........................................................................................................................................ + // gap // ..................................................................................................................................................... + add v0.4S, v22.4S, v24.4S // ..........*.......................................................................................................................................... + // gap // ..................................................................................................................................................... + mls v26.4S, v6.4S, v8.S[0] // ..........................*.......................................................................................................................... + // gap // ..................................................................................................................................................... + mls v18.4S, v10.4S, v8.S[0] // .........................*........................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v28.4S, v17.4S, v27.4S // ..............*...................................................................................................................................... + // gap // ..................................................................................................................................................... + sub v11.4S, v0.4S, v7.4S // .................*................................................................................................................................... 
+ // gap // ..................................................................................................................................................... + trn1 v30.4S, v17.4S, v27.4S // .............*....................................................................................................................................... + // gap // ..................................................................................................................................................... + sub v1.4S, v18.4S, v26.4S // ..............................*...................................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v19.4S, v11.4S, v21.4S // ............................*........................................................................................................................ + // gap // ..................................................................................................................................................... + mul v24.4S, v11.4S, v15.4S // ...........................*......................................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v17.4S, v1.4S, v21.4S // ..................................*.................................................................................................................. + // gap // ..................................................................................................................................................... 
+ mul v6.4S, v1.4S, v15.4S // .................................*................................................................................................................... + // gap // ..................................................................................................................................................... + add v1.4S, v18.4S, v26.4S // ...............................*..................................................................................................................... + // gap // ..................................................................................................................................................... + add v27.4S, v0.4S, v7.4S // ..................*.................................................................................................................................. + // gap // ..................................................................................................................................................... + mls v24.4S, v19.4S, v8.S[0] // ................................*.................................................................................................................... + // gap // ..................................................................................................................................................... + mls v6.4S, v17.4S, v8.S[0] // .......................................*............................................................................................................. + // gap // ..................................................................................................................................................... + trn1 v15.4S, v27.4S, v1.4S // .....................................*............................................................................................................... 
+ // gap // ..................................................................................................................................................... + trn2 v11.2D, v16.2D, v28.2D // ......................*.............................................................................................................................. + // gap // ..................................................................................................................................................... + trn2 v7.4S, v27.4S, v1.4S // ......................................*.............................................................................................................. + // gap // ..................................................................................................................................................... + trn2 v10.4S, v24.4S, v6.4S // .................................................*................................................................................................... + // gap // ..................................................................................................................................................... + trn1 v24.4S, v24.4S, v6.4S // ...........................................*......................................................................................................... + // gap // ..................................................................................................................................................... + trn1 v1.4S, v13.4S, v12.4S // ...........*......................................................................................................................................... + // gap // ..................................................................................................................................................... 
+ trn1 v0.2D, v7.2D, v10.2D // .............................................................*....................................................................................... + // gap // ..................................................................................................................................................... + trn2 v22.2D, v7.2D, v10.2D // ...........................................................*......................................................................................... + // gap // ..................................................................................................................................................... + trn2 v27.2D, v15.2D, v24.2D // .....................................................*............................................................................................... + // gap // ..................................................................................................................................................... + trn1 v7.2D, v15.2D, v24.2D // ............................................................*........................................................................................ + // gap // ..................................................................................................................................................... + add v6.4S, v27.4S, v22.4S // ............................................................................*........................................................................ + // gap // ..................................................................................................................................................... + add v18.4S, v7.4S, v0.4S // ....................................................................*................................................................................ 
+ // gap // ..................................................................................................................................................... + trn1 v28.2D, v16.2D, v28.2D // ........................*............................................................................................................................ + // gap // ..................................................................................................................................................... + trn1 v15.2D, v1.2D, v30.2D // .......................*............................................................................................................................. + // gap // ..................................................................................................................................................... + add v17.4S, v18.4S, v6.4S // ...........................................................................................*......................................................... + // gap // ..................................................................................................................................................... + sub v10.4S, v15.4S, v28.4S // .............................*....................................................................................................................... + // gap // ..................................................................................................................................................... + trn2 v13.2D, v1.2D, v30.2D // .....................*............................................................................................................................... + // gap // ..................................................................................................................................................... 
+ srshr v26.4S, v17.4S, #23 // ...........................................................................................................*......................................... + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v10.4S, v14.4S // ........................................*............................................................................................................ + // gap // ..................................................................................................................................................... + mul v21.4S, v10.4S, v25.4S // ....................................*................................................................................................................ + // gap // ..................................................................................................................................................... + mls v17.4S, v26.4S, v8.4S // ...............................................................................................................*..................................... + // gap // ..................................................................................................................................................... + sub v10.4S, v13.4S, v11.4S // .........................................*........................................................................................................... + // gap // ..................................................................................................................................................... + sub v1.4S, v7.4S, v0.4S // .................................................................*................................................................................... 
+ // gap // ..................................................................................................................................................... + mls v21.4S, v30.4S, v8.S[0] // ............................................*........................................................................................................ + // gap // ..................................................................................................................................................... + mul v7.4S, v10.4S, v5.4S // .............................................*....................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v29.4S // ..............................................*...................................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v1.4S, v9.S[3] // ..........................................................................*.......................................................................... + // gap // ..................................................................................................................................................... + add v30.4S, v13.4S, v11.4S // ..........................................*.......................................................................................................... + // gap // ..................................................................................................................................................... 
+ add v13.4S, v15.4S, v28.4S // ...................................*................................................................................................................. + // gap // ..................................................................................................................................................... + mls v7.4S, v24.4S, v8.S[0] // ..................................................*.................................................................................................. + // gap // ..................................................................................................................................................... + sub v0.4S, v27.4S, v22.4S // ...........................................................................*......................................................................... + // gap // ..................................................................................................................................................... + sub v24.4S, v13.4S, v30.4S // ...............................................*..................................................................................................... + // gap // ..................................................................................................................................................... + add v15.4S, v13.4S, v30.4S // ................................................*.................................................................................................... + // gap // ..................................................................................................................................................... + sub v12.4S, v21.4S, v7.4S // ......................................................*.............................................................................................. 
+ // gap // ..................................................................................................................................................... + sqrdmulh v13.4S, v24.4S, v4.4S // ....................................................*................................................................................................ + // gap // ..................................................................................................................................................... + mul v27.4S, v24.4S, v2.4S // ...................................................*................................................................................................. + // gap // ..................................................................................................................................................... + sqrdmulh v14.4S, v12.4S, v4.4S // ..........................................................*.......................................................................................... + // gap // ..................................................................................................................................................... + mul v28.4S, v12.4S, v2.4S // .........................................................*........................................................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v0.4S, v31.S[1] // ................................................................................*.................................................................... + // gap // ..................................................................................................................................................... 
+ add v24.4S, v21.4S, v7.4S // .......................................................*............................................................................................. + // gap // ..................................................................................................................................................... + mls v27.4S, v13.4S, v8.S[0] // ........................................................*............................................................................................ + // gap // ..................................................................................................................................................... + mls v28.4S, v14.4S, v8.S[0] // ..............................................................*...................................................................................... + // gap // ..................................................................................................................................................... + trn1 v13.4S, v15.4S, v24.4S // ...............................................................*..................................................................................... + // gap // ..................................................................................................................................................... + mul v7.4S, v0.4S, v31.S[0] // ...............................................................................*..................................................................... + // gap // ..................................................................................................................................................... + trn2 v24.4S, v15.4S, v24.4S // ................................................................*.................................................................................... 
+ // gap // ..................................................................................................................................................... + trn2 v0.4S, v27.4S, v28.4S // ...................................................................*................................................................................. + // gap // ..................................................................................................................................................... + trn1 v15.4S, v27.4S, v28.4S // ..................................................................*.................................................................................. + // gap // ..................................................................................................................................................... + mls v7.4S, v30.4S, v8.S[0] // ....................................................................................*................................................................ + // gap // ..................................................................................................................................................... + trn1 v12.2D, v24.2D, v0.2D // ........................................................................*............................................................................ + // gap // ..................................................................................................................................................... + trn2 v28.2D, v24.2D, v0.2D // ......................................................................*.............................................................................. + // gap // ..................................................................................................................................................... 
+ trn2 v21.2D, v13.2D, v15.2D // .....................................................................*............................................................................... + // gap // ..................................................................................................................................................... + trn1 v11.2D, v13.2D, v15.2D // .......................................................................*............................................................................. + // gap // ..................................................................................................................................................... + sub v30.4S, v21.4S, v28.4S // .....................................................................................*............................................................... + // gap // ..................................................................................................................................................... + sub v24.4S, v11.4S, v12.4S // .............................................................................*....................................................................... + // gap // ..................................................................................................................................................... + mul v15.4S, v1.4S, v9.S[2] // .........................................................................*........................................................................... + // gap // ..................................................................................................................................................... + mul v13.4S, v30.4S, v3.S[0] // ........................................................................................*............................................................ 
+ // gap // ..................................................................................................................................................... + sqrdmulh v0.4S, v24.4S, v31.S[3] // ...................................................................................*................................................................. + // gap // ..................................................................................................................................................... + mul v1.4S, v24.4S, v31.S[2] // ..................................................................................*.................................................................. + // gap // ..................................................................................................................................................... + mls v15.4S, v10.4S, v8.S[0] // ..............................................................................*...................................................................... + // gap // ..................................................................................................................................................... + add v27.4S, v21.4S, v28.4S // ......................................................................................*.............................................................. + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v30.4S, v3.S[1] // .........................................................................................*........................................................... + // gap // ..................................................................................................................................................... 
+ mls v1.4S, v0.4S, v8.S[0] // .......................................................................................*............................................................. + // gap // ..................................................................................................................................................... + sub v10.4S, v15.4S, v7.4S // ............................................................................................*........................................................ + // gap // ..................................................................................................................................................... + sub v0.4S, v18.4S, v6.4S // ..........................................................................................*.......................................................... + // gap // ..................................................................................................................................................... + add v28.4S, v11.4S, v12.4S // .................................................................................*................................................................... + // gap // ..................................................................................................................................................... + mls v13.4S, v30.4S, v8.S[0] // .............................................................................................*....................................................... + // gap // ..................................................................................................................................................... + mul v6.4S, v0.4S, v23.S[2] // ..............................................................................................*...................................................... 
+ // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v0.4S, v23.S[3] // ...............................................................................................*..................................................... + // gap // ..................................................................................................................................................... + sqrdmulh v24.4S, v10.4S, v23.S[3] // ..................................................................................................*.................................................. + // gap // ..................................................................................................................................................... + sub v0.4S, v1.4S, v13.4S // .........................................................................................................*........................................... + // gap // ..................................................................................................................................................... + mul v12.4S, v10.4S, v23.S[2] // .................................................................................................*................................................... + // gap // ..................................................................................................................................................... + add v7.4S, v15.4S, v7.4S // ................................................................................................*.................................................... + // gap // ..................................................................................................................................................... 
+ sqrdmulh v31.4S, v0.4S, v9.S[1] // ..............................................................................................................*...................................... + // gap // ..................................................................................................................................................... + mls v6.4S, v30.4S, v8.S[0] // ...................................................................................................*................................................. + // gap // ..................................................................................................................................................... + mls v12.4S, v24.4S, v8.S[0] // ......................................................................................................*.............................................. + // gap // ..................................................................................................................................................... + mul v3.4S, v0.4S, v9.S[0] // .............................................................................................................*....................................... + // gap // ..................................................................................................................................................... + add v30.4S, v1.4S, v13.4S // ..........................................................................................................*.......................................... + // gap // ..................................................................................................................................................... + srshr v13.4S, v7.4S, #23 // ................................................................................................................*.................................... 
+ // gap // ..................................................................................................................................................... + add v2.4S, v28.4S, v27.4S // .....................................................................................................*............................................... + // gap // ..................................................................................................................................................... + mls v3.4S, v31.4S, v8.S[0] // ..................................................................................................................*.................................. + // gap // ..................................................................................................................................................... + mls v7.4S, v13.4S, v8.4S // ...................................................................................................................*................................. + // gap // ..................................................................................................................................................... + srshr v10.4S, v30.4S, #23 // .....................................................................................................................*............................... + // gap // ..................................................................................................................................................... + srshr v24.4S, v2.4S, #23 // .................................................................................................................*................................... + // gap // ..................................................................................................................................................... 
+ add v0.4S, v12.4S, v3.4S // ......................................................................................................................................*.............. + // gap // ..................................................................................................................................................... + mls v30.4S, v10.4S, v8.4S // ........................................................................................................................*............................ + // gap // ..................................................................................................................................................... + mls v2.4S, v24.4S, v8.4S // ....................................................................................................................*................................ + // gap // ..................................................................................................................................................... + str q0, [x1, #48] // ..............................................................................................................................................*...... + // gap // ..................................................................................................................................................... + sub v19.4S, v28.4S, v27.4S // ....................................................................................................*................................................ + // gap // ..................................................................................................................................................... + sub v0.4S, v7.4S, v30.4S // ..............................................................................................................................*...................... 
+ // gap // ..................................................................................................................................................... + add v7.4S, v7.4S, v30.4S // ...............................................................................................................................*..................... + // gap // ..................................................................................................................................................... + sub v24.4S, v17.4S, v2.4S // .........................................................................................................................*........................... + // gap // ..................................................................................................................................................... + sqrdmulh v30.4S, v0.4S, v23.S[1] // ...................................................................................................................................*................. + // gap // ..................................................................................................................................................... + mul v0.4S, v0.4S, v23.S[0] // ..................................................................................................................................*.................. + // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v24.4S, v23.S[1] // .............................................................................................................................*....................... + // gap // ..................................................................................................................................................... 
+ str q7, [x1, #16] // ............................................................................................................................................*........ + // gap // ..................................................................................................................................................... + mul v7.4S, v24.4S, v23.S[0] // ............................................................................................................................*........................ + // gap // ..................................................................................................................................................... + mls v0.4S, v30.4S, v8.S[0] // .......................................................................................................................................*............. + // gap // ..................................................................................................................................................... + mul v13.4S, v19.4S, v9.S[0] // .......................................................................................................*............................................. + // gap // ..................................................................................................................................................... + sqrdmulh v1.4S, v19.4S, v9.S[1] // ........................................................................................................*............................................ + // gap // ..................................................................................................................................................... + mls v7.4S, v10.4S, v8.S[0] // .................................................................................................................................*................... 
+ // gap // ..................................................................................................................................................... + str q0, [x2, #16] // .................................................................................................................................................*... + // gap // ..................................................................................................................................................... + sub v0.4S, v12.4S, v3.4S // .....................................................................................................................................*............... + // gap // ..................................................................................................................................................... + mls v13.4S, v1.4S, v8.S[0] // ............................................................................................................*........................................ + // gap // ..................................................................................................................................................... + add v24.4S, v17.4S, v2.4S // ..........................................................................................................................*.......................... + // gap // ..................................................................................................................................................... + str q7, [x2], #(16*4) // ................................................................................................................................................*.... + // gap // ..................................................................................................................................................... 
+ sqrdmulh v30.4S, v0.4S, v23.S[1] // .........................................................................................................................................*........... + // gap // ..................................................................................................................................................... + sub v7.4S, v6.4S, v13.4S // ......................................................................................................................*.............................. + // gap // ..................................................................................................................................................... + mul v0.4S, v0.4S, v23.S[0] // ........................................................................................................................................*............ + // gap // ..................................................................................................................................................... + add v1.4S, v6.4S, v13.4S // .......................................................................................................................*............................. + // gap // ..................................................................................................................................................... + sqrdmulh v10.4S, v7.4S, v23.S[1] // ................................................................................................................................*.................... + // gap // ..................................................................................................................................................... + mul v7.4S, v7.4S, v23.S[0] // ...........................................................................................................................*......................... 
+ // gap // ..................................................................................................................................................... + str q1, [x1, #32] // .............................................................................................................................................*....... + // gap // ..................................................................................................................................................... + mls v0.4S, v30.4S, v8.S[0] // ...........................................................................................................................................*......... + // gap // ..................................................................................................................................................... + str q24, [x1], #(16*4) // ..........................................................................................................................................*.......... + add x1, x1, #64 // ...............................................................................................................................................*..... + mls v7.4S, v10.4S, v8.S[0] // ....................................................................................................................................*................ + // gap // ..................................................................................................................................................... + // gap // ..................................................................................................................................................... + // gap // ..................................................................................................................................................... 
+ str q0, [x2, #-16] // ...................................................................................................................................................*. + // gap // ..................................................................................................................................................... + // gap // ..................................................................................................................................................... + // gap // ..................................................................................................................................................... + str q7, [x2, #-32] // ..................................................................................................................................................*.. + add x2, x2, #64 // ....................................................................................................................................................* + + // original source code + // trn1 v0.4S, v30.4S, v24.4S // *.................................................................................................................................................... + // trn2 v10.4S, v30.4S, v24.4S // .*................................................................................................................................................... + // trn2 v30.4S, v7.4S, v1.4S // ..*.................................................................................................................................................. + // trn2 v24.2D, v0.2D, v6.2D // ...*................................................................................................................................................. + // trn1 v0.2D, v0.2D, v6.2D // ....*................................................................................................................................................ 
+ // trn2 v7.2D, v10.2D, v30.2D // ......*.............................................................................................................................................. + // trn1 v10.2D, v10.2D, v30.2D // .....*............................................................................................................................................... + // sub v30.4S, v24.4S, v7.4S // ........*............................................................................................................................................ + // add v24.4S, v24.4S, v7.4S // .........*........................................................................................................................................... + // sub v7.4S, v0.4S, v10.4S // .......*............................................................................................................................................. + // add v0.4S, v0.4S, v10.4S // ...............*..................................................................................................................................... + // trn1 v10.4S, v13.4S, v12.4S // ...................................*................................................................................................................. + // trn2 v1.4S, v13.4S, v12.4S // ..............*...................................................................................................................................... + // trn1 v13.4S, v17.4S, v27.4S // ....................*................................................................................................................................ + // trn2 v6.4S, v17.4S, v27.4S // ..................*.................................................................................................................................. 
+ // mul v12.4S, v30.4S, v11.4S // ...........*......................................................................................................................................... + // sqrdmulh v30.4S, v30.4S, v19.4S // ............*........................................................................................................................................ + // sub v17.4S, v0.4S, v24.4S // ...................*................................................................................................................................. + // add v0.4S, v0.4S, v24.4S // ...........................*......................................................................................................................... + // mul v24.4S, v7.4S, v28.4S // .............*....................................................................................................................................... + // sqrdmulh v7.4S, v7.4S, v18.4S // ..........*.......................................................................................................................................... + // trn2 v28.2D, v10.2D, v13.2D // ..............................................*...................................................................................................... + // trn2 v27.2D, v1.2D, v6.2D // ...............................*..................................................................................................................... + // trn1 v10.2D, v10.2D, v13.2D // ...........................................*......................................................................................................... + // trn1 v1.2D, v1.2D, v6.2D // ..........................................*.......................................................................................................... 
+ // mls v24.4S, v7.4S, v8.S[0] // .................*................................................................................................................................... + // mls v12.4S, v30.4S, v8.S[0] // ................*.................................................................................................................................... + // mul v30.4S, v17.4S, v15.4S // .......................*............................................................................................................................. + // sqrdmulh v7.4S, v17.4S, v21.4S // ......................*.............................................................................................................................. + // sub v13.4S, v10.4S, v1.4S // .............................................*....................................................................................................... + // sub v6.4S, v24.4S, v12.4S // .....................*............................................................................................................................... + // add v24.4S, v24.4S, v12.4S // ..........................*.......................................................................................................................... + // mls v30.4S, v7.4S, v8.S[0] // ............................*........................................................................................................................ + // mul v7.4S, v6.4S, v15.4S // .........................*........................................................................................................................... + // sqrdmulh v6.4S, v6.4S, v21.4S // ........................*............................................................................................................................ 
+ // add v10.4S, v10.4S, v1.4S // ..........................................................*.......................................................................................... + // mul v1.4S, v13.4S, v25.4S // .................................................*................................................................................................... + // trn1 v12.4S, v0.4S, v24.4S // ..............................*...................................................................................................................... + // trn2 v0.4S, v0.4S, v24.4S // ................................*.................................................................................................................... + // mls v7.4S, v6.4S, v8.S[0] // .............................*....................................................................................................................... + // sqrdmulh v24.4S, v13.4S, v14.4S // ................................................*.................................................................................................... + // sub v13.4S, v28.4S, v27.4S // ...................................................*................................................................................................. + // add v6.4S, v28.4S, v27.4S // .........................................................*........................................................................................... + // trn1 v17.4S, v30.4S, v7.4S // ..................................*.................................................................................................................. + // mls v1.4S, v24.4S, v8.S[0] // .....................................................*............................................................................................... 
+ // mul v24.4S, v13.4S, v5.4S // ......................................................*.............................................................................................. + // sqrdmulh v13.4S, v13.4S, v29.4S // .......................................................*............................................................................................. + // sub v28.4S, v10.4S, v6.4S // .............................................................*....................................................................................... + // add v10.4S, v10.4S, v6.4S // ..............................................................*...................................................................................... + // trn2 v30.4S, v30.4S, v7.4S // .................................*................................................................................................................... + // mls v24.4S, v13.4S, v8.S[0] // ...........................................................*......................................................................................... + // mul v7.4S, v28.4S, v2.4S // .................................................................*................................................................................... + // sqrdmulh v13.4S, v28.4S, v4.4S // ................................................................*.................................................................................... + // trn2 v6.2D, v12.2D, v17.2D // ......................................*.............................................................................................................. + // sub v28.4S, v1.4S, v24.4S // ...............................................................*..................................................................................... 
+ // add v24.4S, v1.4S, v24.4S // .....................................................................*............................................................................... + // mls v7.4S, v13.4S, v8.S[0] // ......................................................................*.............................................................................. + // mul v1.4S, v28.4S, v2.4S // ...................................................................*................................................................................. + // sqrdmulh v13.4S, v28.4S, v4.4S // ..................................................................*.................................................................................. + // trn2 v28.2D, v0.2D, v30.2D // .....................................*............................................................................................................... + // trn1 v12.2D, v12.2D, v17.2D // .......................................*............................................................................................................. + // trn1 v0.2D, v0.2D, v30.2D // ....................................*................................................................................................................ + // mls v1.4S, v13.4S, v8.S[0] // .......................................................................*............................................................................. + // trn1 v30.4S, v10.4S, v24.4S // ........................................................................*............................................................................ + // trn2 v10.4S, v10.4S, v24.4S // ..........................................................................*.......................................................................... 
+ // sub v24.4S, v12.4S, v0.4S // ....................................................*................................................................................................ + // trn1 v13.4S, v7.4S, v1.4S // ............................................................................*........................................................................ + // trn2 v7.4S, v7.4S, v1.4S // ...........................................................................*......................................................................... + // add v0.4S, v12.4S, v0.4S // .........................................*........................................................................................................... + // trn2 v1.2D, v30.2D, v13.2D // ................................................................................*.................................................................... + // trn2 v12.2D, v10.2D, v7.2D // ...............................................................................*..................................................................... + // trn1 v30.2D, v30.2D, v13.2D // .................................................................................*................................................................... + // trn1 v10.2D, v10.2D, v7.2D // ..............................................................................*...................................................................... + // mul v7.4S, v24.4S, v9.S[2] // ....................................................................................*................................................................ + // sqrdmulh v24.4S, v24.4S, v9.S[3] // ........................................................*............................................................................................ 
+ // sub v13.4S, v6.4S, v28.4S // ............................................................*........................................................................................ + // add v6.4S, v6.4S, v28.4S // ........................................*............................................................................................................ + // sub v17.4S, v30.4S, v10.4S // ...................................................................................*................................................................. + // mls v7.4S, v24.4S, v8.S[0] // ........................................................................................*............................................................ + // mul v24.4S, v13.4S, v31.S[0] // .........................................................................*........................................................................... + // sqrdmulh v13.4S, v13.4S, v31.S[1] // ....................................................................*................................................................................ + // add v10.4S, v30.4S, v10.4S // ..............................................................................................*...................................................... + // mul v30.4S, v17.4S, v31.S[2] // .......................................................................................*............................................................. + // sqrdmulh v17.4S, v17.4S, v31.S[3] // ......................................................................................*.............................................................. + // mls v24.4S, v13.4S, v8.S[0] // .............................................................................*....................................................................... 
+ // sub v13.4S, v1.4S, v12.4S // ..................................................................................*.................................................................. + // add v1.4S, v1.4S, v12.4S // .........................................................................................*........................................................... + // mls v30.4S, v17.4S, v8.S[0] // ...........................................................................................*......................................................... + // mul v12.4S, v13.4S, v3.S[0] // .....................................................................................*............................................................... + // sqrdmulh v13.4S, v13.4S, v3.S[1] // ..........................................................................................*.......................................................... + // sub v17.4S, v0.4S, v6.4S // .............................................................................................*....................................................... + // add v0.4S, v0.4S, v6.4S // ............................................*........................................................................................................ + // sub v6.4S, v7.4S, v24.4S // ............................................................................................*........................................................ + // mls v12.4S, v13.4S, v8.S[0] // ...............................................................................................*..................................................... + // mul v13.4S, v17.4S, v23.S[2] // ................................................................................................*.................................................... 
+ // sqrdmulh v17.4S, v17.4S, v23.S[3] // .................................................................................................*................................................... + // add v24.4S, v7.4S, v24.4S // .....................................................................................................*............................................... + // mul v7.4S, v6.4S, v23.S[2] // ....................................................................................................*................................................ + // sqrdmulh v6.4S, v6.4S, v23.S[3] // ..................................................................................................*.................................................. + // mls v13.4S, v17.4S, v8.S[0] // .......................................................................................................*............................................. + // sub v17.4S, v10.4S, v1.4S // .....................................................................................................................*............................... + // add v10.4S, v10.4S, v1.4S // ............................................................................................................*........................................ + // mls v7.4S, v6.4S, v8.S[0] // ........................................................................................................*............................................ + // mul v1.4S, v17.4S, v9.S[0] // ...............................................................................................................................*..................... + // sqrdmulh v6.4S, v17.4S, v9.S[1] // ................................................................................................................................*.................... 
+ // sub v17.4S, v30.4S, v12.4S // ...................................................................................................*................................................. + // add v30.4S, v30.4S, v12.4S // ..........................................................................................................*.......................................... + // srshr v12.4S, v0.4S, #23 // ...............................................*..................................................................................................... + // mls v1.4S, v6.4S, v8.S[0] // ....................................................................................................................................*................ + // mul v6.4S, v17.4S, v9.S[0] // .........................................................................................................*........................................... + // sqrdmulh v17.4S, v17.4S, v9.S[1] // ......................................................................................................*.............................................. + // mls v0.4S, v12.4S, v8.4S // ..................................................*.................................................................................................. + // srshr v12.4S, v24.4S, #23 // ...........................................................................................................*......................................... + // srshr v28.4S, v10.4S, #23 // ................................................................................................................*.................................... + // mls v6.4S, v17.4S, v8.S[0] // .............................................................................................................*....................................... 
+ // mls v24.4S, v12.4S, v8.4S // ..............................................................................................................*...................................... + // mls v10.4S, v28.4S, v8.4S // ...................................................................................................................*................................. + // srshr v12.4S, v30.4S, #23 // ...............................................................................................................*..................................... + // sub v17.4S, v13.4S, v1.4S // ........................................................................................................................................*............ + // add v1.4S, v13.4S, v1.4S // ..........................................................................................................................................*.......... + // mls v30.4S, v12.4S, v8.4S // ..................................................................................................................*.................................. + // sub v13.4S, v0.4S, v10.4S // ........................................................................................................................*............................ + // add v0.4S, v0.4S, v10.4S // .....................................................................................................................................*............... + // mul v10.4S, v17.4S, v23.S[0] // ............................................................................................................................................*........ + // mul v12.4S, v13.4S, v23.S[0] // .............................................................................................................................*....................... 
+ // sqrdmulh v13.4S, v13.4S, v23.S[1] // ...........................................................................................................................*......................... + // sub v28.4S, v24.4S, v30.4S // ......................................................................................................................*.............................. + // add v30.4S, v24.4S, v30.4S // .......................................................................................................................*............................. + // sqrdmulh v24.4S, v17.4S, v23.S[1] // ...........................................................................................................................................*......... + // mls v12.4S, v13.4S, v8.S[0] // .................................................................................................................................*................... + // mul v13.4S, v28.4S, v23.S[0] // ..........................................................................................................................*.......................... + // sqrdmulh v17.4S, v28.4S, v23.S[1] // .........................................................................................................................*........................... + // mls v10.4S, v24.4S, v8.S[0] // .................................................................................................................................................*... + // sub v24.4S, v7.4S, v6.4S // ...................................................................................................................................*................. + // add v7.4S, v7.4S, v6.4S // .................................................................................................................*................................... 
+ // mls v13.4S, v17.4S, v8.S[0] // ..............................................................................................................................*...................... + // mul v6.4S, v24.4S, v23.S[0] // .........................................................................................................................................*........... + // sqrdmulh v24.4S, v24.4S, v23.S[1] // .......................................................................................................................................*............. + // str q0, [x1], #(16*4) // ...............................................................................................................................................*..... + // mls v6.4S, v24.4S, v8.S[0] // ..............................................................................................................................................*...... + // str q30, [x1, #-48] // ............................................................................................................................*........................ + // str q1, [x1, #-32] // .............................................................................................................................................*....... + // str q7, [x1, #-16] // ....................................................................................................................*................................ + // add x1, x1, #64 // ................................................................................................................................................*.... + // str q12, [x2], #(16*4) // ......................................................................................................................................*.............. + // str q13, [x2, #-48] // ..................................................................................................................................*.................. 
+ // str q10, [x2, #-32] // ...................................................................................................................................................*. + // str q6, [x2, #-16] // ..................................................................................................................................................*.. + // add x2, x2, #64 // ....................................................................................................................................................* + + +// ----------------------------------------------------------------------------- + + // Register aliases for the layer123 section; v25/v26 (ninv, ninv_tw) are the vector operands of a mul/sqrdmulh pair inside the layer123 loop. + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + // ld1r replicates the 32-bit element at [xtmp] into all four lanes. NOTE(review): ASM_LOAD is defined outside this section; presumably it places the named symbol's address in xtmp - confirm. + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + // Lane-wise: modulus_half = consts >> 1 (unsigned), neg_modulus_half = -modulus_half. + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + // Scheduled preamble of 12 instructions: the '*' column in each trailing comment is that instruction's 0-based index in the 'original source code' listing that follows, and the '*' there gives its index in this scheduled order. + .p2align 2 + ldr q12, [x0, #256] // ..*......... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q21, [x0, #384] // ........*... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q15, [x0, #896] // .......*.... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q17, [x0, #512] // ...*........ + // gap // ............ + // gap // ............ + // gap // ............ + ldr q27, [x0, #768] // .....*...... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q28, [x0, #640] // ....*....... + // gap // ............ + // gap // ............ + // gap // ............ + add v19.4S, v12.4S, v21.4S // ..........*. + // gap // ............ + add v11.4S, v27.4S, v15.4S // .........*.. + // gap // ............ + add v18.4S, v17.4S, v28.4S // ......*..... + // gap // ............ + ldr q7, [x0, #0] // *........... + // gap // ............ + // gap // ............ + // gap // ............
+ add v4.4S, v18.4S, v11.4S // ...........* + // gap // ............ + ldr q6, [x0, #128] // .*.......... + // gap // ............ + + // original source code + // ldr q7, [x0, #0] // .........*.. + // ldr q6, [x0, #128] // ...........* + // ldr q12, [x0, #256] // *........... + // ldr q17, [x0, #512] // ...*........ + // ldr q28, [x0, #640] // .....*...... + // ldr q27, [x0, #768] // ....*....... + // add v18.4S, v17.4S, v28.4S // ........*... + // ldr q15, [x0, #896] // ..*......... + // ldr q21, [x0, #384] // .*.......... + // add v11.4S, v27.4S, v15.4S // .......*.... + // add v19.4S, v12.4S, v21.4S // ......*..... + // add v4.4S, v18.4S, v11.4S // ..........*. + + // NOTE(review): count was set to 8 above and is decremented once before the loop, presumably because the preamble already issued the first iteration's early loads/adds - confirm against the loop's branch. + sub count, count, #1 +layer123_start: + sub v10.4S, v7.4S, v6.4S // ........*............................................................................................................... + // gap // ........................................................................................................................ + add v24.4S, v7.4S, v6.4S // .........*.............................................................................................................. + // gap // ........................................................................................................................ + sub v7.4S, v12.4S, v21.4S // .............*.......................................................................................................... + // gap // ........................................................................................................................ + mul v13.4S, v10.4S, v1.S[2] // ..........*............................................................................................................. + // gap // ........................................................................................................................
+ sqrdmulh v10.4S, v10.4S, v1.S[3] // ...........*............................................................................................................ + // gap // ........................................................................................................................ + sub v6.4S, v24.4S, v19.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + add v24.4S, v24.4S, v19.4S // .............................*.......................................................................................... + // gap // ........................................................................................................................ + mul v12.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v2.S[1] // ................*....................................................................................................... + // gap // ........................................................................................................................ + mls v13.4S, v10.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + sub v10.4S, v17.4S, v28.4S // ..................*..................................................................................................... 
+ // gap // ........................................................................................................................ + mul v17.4S, v6.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v6.4S, v6.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + sub v28.4S, v24.4S, v4.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + add v24.4S, v24.4S, v4.4S // .................................................*...................................................................... + // gap // ........................................................................................................................ + mls v12.4S, v7.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + mul v7.4S, v10.4S, v2.S[2] // ....................*................................................................................................... + // gap // ........................................................................................................................ 
+ sqrdmulh v10.4S, v10.4S, v2.S[3] // .....................*.................................................................................................. + // gap // ........................................................................................................................ + sub v27.4S, v27.4S, v15.4S // .......................*................................................................................................ + // gap // ........................................................................................................................ + sub v15.4S, v13.4S, v12.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + add v13.4S, v13.4S, v12.4S // ..................................*..................................................................................... + // gap // ........................................................................................................................ + mls v7.4S, v10.4S, v8.S[0] // ......................*................................................................................................. + // gap // ........................................................................................................................ + mul v10.4S, v27.4S, v3.S[0] // .........................*.............................................................................................. + // gap // ........................................................................................................................ + mls v17.4S, v6.4S, v8.S[0] // ................................*....................................................................................... 
+ // gap // ........................................................................................................................ + sqrdmulh v6.4S, v27.4S, v3.S[1] // ..........................*............................................................................................. + // gap // ........................................................................................................................ + mul v12.4S, v15.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v27.4S, v15.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + mul v15.4S, v28.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + mul v21.4S, v24.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ 
+ sqrdmulh v24.4S, v24.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + mls v10.4S, v6.4S, v8.S[0] // ...........................*............................................................................................ + // gap // ........................................................................................................................ + mls v12.4S, v27.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + sub v6.4S, v18.4S, v11.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + mls v15.4S, v28.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + sub v28.4S, v7.4S, v10.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + mul v27.4S, v6.4S, v1.S[0] // ........................................*............................................................................... 
+ // gap // ........................................................................................................................ + sqrdmulh v6.4S, v6.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + add v10.4S, v7.4S, v10.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + mul v7.4S, v28.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v28.4S, v1.S[1] // ..............................................*......................................................................... + // gap // ........................................................................................................................ + sub v18.4S, v13.4S, v10.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + add v10.4S, v13.4S, v10.4S // ......................................................*................................................................. + // gap // ........................................................................................................................ 
+ mls v27.4S, v6.4S, v8.S[0] // ..........................................*............................................................................. + // gap // ........................................................................................................................ + mls v7.4S, v28.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + mul v13.4S, v18.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + sqrdmulh v6.4S, v18.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + sub v28.4S, v17.4S, v27.4S // ..........................................................*............................................................. + // gap // ........................................................................................................................ + add v17.4S, v17.4S, v27.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + sub v27.4S, v12.4S, v7.4S // ...............................................................*........................................................ 
+ // gap // ........................................................................................................................ + mls v13.4S, v6.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + mul v6.4S, v28.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v28.4S, v0.S[1] // .............................................................*.......................................................... + // gap // ........................................................................................................................ + add v7.4S, v12.4S, v7.4S // ................................................................*....................................................... + // gap // ........................................................................................................................ + mul v12.4S, v27.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ 
+ mls v6.4S, v28.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v15.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + cmge v18.4S, v15.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + mls v21.4S, v24.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + mls v12.4S, v27.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + sub v24.4S, v28.4S, v18.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v13.4S // ........................................................................*............................................... 
+ // gap // ........................................................................................................................ + cmge v27.4S, v13.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + mls v15.4S, v24.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sub v24.4S, v28.4S, v27.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v6.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + cmge v27.4S, v6.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + mls v13.4S, v24.4S, v8.4S // ...........................................................................*............................................ + // gap // ........................................................................................................................ 
+ sub v24.4S, v28.4S, v27.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v12.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + cmge v27.4S, v12.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + mls v6.4S, v24.4S, v8.4S // ...............................................................................*........................................ + // gap // ........................................................................................................................ + sub v24.4S, v28.4S, v27.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + str q15, [x0, #512] // ....................................................................................*................................... + // gap // ........................................................................................................................ + mul v28.4S, v10.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ 
+ mls v12.4S, v24.4S, v8.4S // ...................................................................................*.................................... + // gap // ........................................................................................................................ + str q13, [x0, #640] // .....................................................................................*.................................. + // gap // ........................................................................................................................ + sqrdmulh v10.4S, v10.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + str q6, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + mul v24.4S, v17.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + str q12, [x0, #896] // .......................................................................................*................................ + // gap // ........................................................................................................................ + mls v28.4S, v10.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ 
+ sqrdmulh v10.4S, v17.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ + mul v13.4S, v7.4S, v25.4S // .................................................................................................*...................... + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v21.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + mls v24.4S, v10.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ + cmge v10.4S, v21.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + mls v13.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... 
+ // gap // ........................................................................................................................ + sub v10.4S, v6.4S, v10.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v28.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + cmge v6.4S, v28.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + mls v21.4S, v10.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + sub v10.4S, v7.4S, v6.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v24.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ + cmge v6.4S, v24.4S, v30.4S // .............................................................................................................*.......... 
+ // gap // ........................................................................................................................ + mls v28.4S, v10.4S, v8.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + sub v10.4S, v7.4S, v6.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + cmge v7.4S, v31.4S, v13.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + cmge v6.4S, v13.4S, v30.4S // .................................................................................................................*...... + // gap // ........................................................................................................................ + mls v24.4S, v10.4S, v8.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + sub v10.4S, v7.4S, v6.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + str q21, [x0], #(16) // ....................................................................................................................*... 
+ // gap // ........................................................................................................................ + ldr q7, [x0, #0] // e....................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v10.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + str q28, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + ldr q6, [x0, #128] // .e...................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q24, [x0, #240] // ......................................................................................................................*. 
+ // gap // ........................................................................................................................ + ldr q12, [x0, #256] // ..e..................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q17, [x0, #512] // ....e................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q28, [x0, #640] // .....e.................................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q27, [x0, #768] // ......e................................................................................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v18.4S, v17.4S, v28.4S // ...................e.................................................................................................... + // gap // ........................................................................................................................ + ldr q15, [x0, #896] // .......e................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q21, [x0, #384] // ...e.................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v11.4S, v27.4S, v15.4S // ........................e............................................................................................... 
+ // gap // ........................................................................................................................ + str q13, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + add v19.4S, v12.4S, v21.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + add v4.4S, v18.4S, v11.4S // .......................................e................................................................................ + // gap // ........................................................................................................................ + + // original source code + // ldr q9, [x0, #0] // e...............|.......................................................................................................e............. + // ldr q10, [x0, #(1*(1024/8))] // ...e............|..........................................................................................................e.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........|............................................................................................................e........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..................................................................................................................e.. + // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.............................................................................................................e....... 
+ // ldr q14, [x0, #(5*(1024/8))] // .......e........|..............................................................................................................e...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......|...............................................................................................................e..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.................................................................................................................e... + // sub v24.4s, v9.4s, v10.4s // ................*..................................................................................................................... + // add v9.4s, v9.4s, v10.4s // ................|*.................................................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ................|..*.................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*................................................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // ................|........*............................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ................|.*................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.|..................................................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ................|......*.............................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*............................................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ................|..............*...................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ................|.........*........................................................................................................... + // add v13.4s, v13.4s, v14.4s // .........e......|................................................................................................................e.... + // mul v14.4s, v24.4s, v2.s[2] // ................|...............*..................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*.................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|....................*................................................................................................ + // sub v24.4s, v15.4s, v16.4s // ................|.................*................................................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...|...................................................................................................................e. + // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*............................................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*............................................................................................. 
+ // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*...................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................|....*................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................|.....*............................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.......................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*......................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ................|......................*.............................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................|..................*.................................................................................................. + // add v10.4s, v10.4s, v12.4s // ................|...................*................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ................|........................*............................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*........................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*..................................................................................... 
+ // sub v24.4s, v13.4s, v15.4s // ................|................................*.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............e|..................................................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.......................................................................... + // sub v24.4s, v14.4s, v16.4s // ................|..................................*.................................................................................. + // add v14.4s, v14.4s, v16.4s // ................|.....................................*............................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*............................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*......................................................................... + // sub v24.4s, v9.4s, v13.4s // ................|............*........................................................................................................ 
+ // add v9.4s, v9.4s, v13.4s // ................|.............*....................................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*......................................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*................................................................................... + // sub v24.4s, v10.4s, v14.4s // ................|........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ................|.........................................*........................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*........................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*....................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*................................................................... + // sub v24.4s, v11.4s, v15.4s // ................|..............................................*...................................................................... + // add v11.4s, v11.4s, v15.4s // ................|...............................................*..................................................................... 
+ // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*................................................................. + // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*............................................................. + // sub v24.4s, v12.4s, v16.4s // ................|................................................*.................................................................... + // add v12.4s, v12.4s, v16.4s // ................|....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*............................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*.............................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*......................................................... + // cmge v27.4s, v31.4s, v13.4s // ................|........................................................*............................................................ + // cmge v28.4s, v13.4s, v30.4s // ................|.........................................................*........................................................... + // sub v28.4s, v27.4s, v28.4s // ................|............................................................*........................................................ 
+ // mls v13.4s, v28.4s, v8.4s // ................|...............................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // ................|.............................................................*....................................................... + // cmge v28.4s, v14.4s, v30.4s // ................|..............................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ................|................................................................*.................................................... + // mls v14.4s, v28.4s, v8.4s // ................|...................................................................*................................................. + // cmge v27.4s, v31.4s, v15.4s // ................|.................................................................*................................................... + // cmge v28.4s, v15.4s, v30.4s // ................|..................................................................*.................................................. + // sub v28.4s, v27.4s, v28.4s // ................|....................................................................*................................................ + // mls v15.4s, v28.4s, v8.4s // ................|.......................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................|.....................................................................*............................................... + // cmge v28.4s, v16.4s, v30.4s // ................|......................................................................*.............................................. 
+ // sub v28.4s, v27.4s, v28.4s // ................|........................................................................*............................................ + // mls v16.4s, v28.4s, v8.4s // ................|...........................................................................*......................................... + // str q13, [x0, #(4*(1024/8))] // ................|.........................................................................*........................................... + // str q14, [x0, #(5*(1024/8))] // ................|............................................................................*........................................ + // str q15, [x0, #(6*(1024/8))] // ................|..............................................................................*...................................... + // str q16, [x0, #(7*(1024/8))] // ................|................................................................................*.................................... + // mul v13.4s, v9.4s, v25.4s // ................|............................*........................................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*....................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.......................................................... + // mul v14.4s, v10.4s, v25.4s // ................|..........................................................................*.......................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ................|.............................................................................*....................................... 
+ // mls v14.4s, v10.4s, v8.s[0] // ................|.................................................................................*................................... + // mul v15.4s, v11.4s, v25.4s // ................|...............................................................................*..................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ................|..................................................................................*.................................. + // mls v15.4s, v11.4s, v8.s[0] // ................|......................................................................................*.............................. + // mul v16.4s, v12.4s, v25.4s // ................|...................................................................................*................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ................|....................................................................................*................................ + // mls v16.4s, v12.4s, v8.s[0] // ................|........................................................................................*............................ + // cmge v27.4s, v31.4s, v13.4s // ................|.....................................................................................*............................... + // cmge v28.4s, v13.4s, v30.4s // ................|.......................................................................................*............................. + // sub v28.4s, v27.4s, v28.4s // ................|.........................................................................................*........................... + // mls v13.4s, v28.4s, v8.4s // ................|............................................................................................*........................ 
+ // cmge v27.4s, v31.4s, v14.4s // ................|..........................................................................................*.......................... + // cmge v28.4s, v14.4s, v30.4s // ................|...........................................................................................*......................... + // sub v28.4s, v27.4s, v28.4s // ................|.............................................................................................*....................... + // mls v14.4s, v28.4s, v8.4s // ................|................................................................................................*.................... + // cmge v27.4s, v31.4s, v15.4s // ................|..............................................................................................*...................... + // cmge v28.4s, v15.4s, v30.4s // ................|...............................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ................|.................................................................................................*................... + // mls v15.4s, v28.4s, v8.4s // ................|....................................................................................................*................ + // cmge v27.4s, v31.4s, v16.4s // ................|..................................................................................................*.................. + // cmge v28.4s, v16.4s, v30.4s // ................|...................................................................................................*................. + // sub v28.4s, v27.4s, v28.4s // ................|.....................................................................................................*............... 
+ // mls v16.4s, v28.4s, v8.4s // .*..............|........................................................................................................*............ + // str q13, [x0], #(16) // ................|......................................................................................................*.............. + // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.........................................................................................................*........... + // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...........................................................................................................*......... + // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|....................................................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + sub v5.4S, v7.4S, v6.4S // *........................................................................................................... + // gap // ............................................................................................................ + sub v27.4S, v27.4S, v15.4S // ..................*......................................................................................... + // gap // ............................................................................................................ + sub v10.4S, v17.4S, v28.4S // ..........*................................................................................................. + // gap // ............................................................................................................ + sqrdmulh v14.4S, v5.4S, v1.S[3] // ....*....................................................................................................... + // gap // ............................................................................................................ 
+ mul v13.4S, v27.4S, v3.S[0] // ......................*..................................................................................... + // gap // ............................................................................................................ + sub v12.4S, v12.4S, v21.4S // ..*......................................................................................................... + // gap // ............................................................................................................ + mul v15.4S, v10.4S, v2.S[2] // ................*........................................................................................... + // gap // ............................................................................................................ + sqrdmulh v20.4S, v10.4S, v2.S[3] // .................*.......................................................................................... + // gap // ............................................................................................................ + sqrdmulh v10.4S, v27.4S, v3.S[1] // ........................*................................................................................... + // gap // ............................................................................................................ + sqrdmulh v27.4S, v12.4S, v2.S[1] // ........*................................................................................................... + // gap // ............................................................................................................ + mul v17.4S, v12.4S, v2.S[0] // .......*.................................................................................................... + // gap // ............................................................................................................ + mul v29.4S, v5.4S, v1.S[2] // ...*........................................................................................................ 
+ // gap // ............................................................................................................ + mls v13.4S, v10.4S, v8.S[0] // ...............................*............................................................................ + // gap // ............................................................................................................ + mls v15.4S, v20.4S, v8.S[0] // .....................*...................................................................................... + // gap // ............................................................................................................ + mls v17.4S, v27.4S, v8.S[0] // ...............*............................................................................................ + // gap // ............................................................................................................ + mls v29.4S, v14.4S, v8.S[0] // .........*.................................................................................................. + // gap // ............................................................................................................ + add v5.4S, v7.4S, v6.4S // .*.......................................................................................................... + // gap // ............................................................................................................ + sub v10.4S, v15.4S, v13.4S // ...................................*........................................................................ + // gap // ............................................................................................................ + add v28.4S, v15.4S, v13.4S // ......................................*..................................................................... + // gap // ............................................................................................................ 
+ sub v7.4S, v29.4S, v17.4S // ...................*........................................................................................ + // gap // ............................................................................................................ + sqrdmulh v24.4S, v10.4S, v1.S[1] // ........................................*................................................................... + // gap // ............................................................................................................ + mul v21.4S, v10.4S, v1.S[0] // .......................................*.................................................................... + // gap // ............................................................................................................ + sqrdmulh v13.4S, v7.4S, v0.S[3] // ..........................*................................................................................. + // gap // ............................................................................................................ + mul v22.4S, v7.4S, v0.S[2] // .........................*.................................................................................. + // gap // ............................................................................................................ + add v16.4S, v5.4S, v19.4S // ......*..................................................................................................... + // gap // ............................................................................................................ + mls v21.4S, v24.4S, v8.S[0] // ............................................*............................................................... + // gap // ............................................................................................................ + add v14.4S, v29.4S, v17.4S // ....................*....................................................................................... 
+ // gap // ............................................................................................................ + mls v22.4S, v13.4S, v8.S[0] // ................................*........................................................................... + // gap // ............................................................................................................ + add v15.4S, v16.4S, v4.4S // ..............*............................................................................................. + // gap // ............................................................................................................ + add v24.4S, v14.4S, v28.4S // ..........................................*................................................................. + // gap // ............................................................................................................ + sub v18.4S, v18.4S, v11.4S // .................................*.......................................................................... + // gap // ............................................................................................................ + add v10.4S, v22.4S, v21.4S // .....................................................*...................................................... + // gap // ............................................................................................................ + sqrdmulh v12.4S, v24.4S, v26.4S // ..............................................................................*............................. + // gap // ............................................................................................................ + mul v6.4S, v24.4S, v25.4S // ...........................................................................*................................ + // gap // ............................................................................................................ 
+ sqrdmulh v13.4S, v10.4S, v26.4S // .....................................................................................*...................... + // gap // ............................................................................................................ + mul v7.4S, v10.4S, v25.4S // ....................................................................................*....................... + // gap // ............................................................................................................ + mul v27.4S, v15.4S, v25.4S // .............................*.............................................................................. + // gap // ............................................................................................................ + sqrdmulh v10.4S, v15.4S, v26.4S // ..............................*............................................................................. + // gap // ............................................................................................................ + sub v20.4S, v5.4S, v19.4S // .....*...................................................................................................... + // gap // ............................................................................................................ + mls v7.4S, v13.4S, v8.S[0] // .........................................................................................*.................. + // gap // ............................................................................................................ + sqrdmulh v17.4S, v18.4S, v1.S[1] // .....................................*...................................................................... + // gap // ............................................................................................................ + mls v27.4S, v10.4S, v8.S[0] // ...........................................................*................................................ 
+ // gap // ............................................................................................................ + sqrdmulh v11.4S, v20.4S, v0.S[3] // ............*............................................................................................... + // gap // ............................................................................................................ + cmge v10.4S, v7.4S, v30.4S // ....................................................................................................*....... + // gap // ............................................................................................................ + cmge v24.4S, v31.4S, v7.4S // ...................................................................................................*........ + // gap // ............................................................................................................ + cmge v23.4S, v31.4S, v27.4S // ......................................................................................*..................... + // gap // ............................................................................................................ + sub v10.4S, v24.4S, v10.4S // ......................................................................................................*..... + // gap // ............................................................................................................ + mul v13.4S, v18.4S, v1.S[0] // ....................................*....................................................................... + // gap // ............................................................................................................ + cmge v15.4S, v27.4S, v30.4S // ........................................................................................*................... + // gap // ............................................................................................................ 
+ mls v7.4S, v10.4S, v8.4S // ........................................................................................................*... + // gap // ............................................................................................................ + sub v10.4S, v23.4S, v15.4S // ..........................................................................................*................. + // gap // ............................................................................................................ + mls v13.4S, v17.4S, v8.S[0] // ...........................................*................................................................ + // gap // ............................................................................................................ + mul v5.4S, v20.4S, v0.S[2] // ...........*................................................................................................ + // gap // ............................................................................................................ + str q7, [x0, #384] // ...........................................................................................................* + // gap // ............................................................................................................ + mls v27.4S, v10.4S, v8.4S // .............................................................................................*.............. + // gap // ............................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // ..................................................................................*......................... + // gap // ............................................................................................................ + mls v5.4S, v11.4S, v8.S[0] // .......................*.................................................................................... 
+ // gap // ............................................................................................................ + sub v17.4S, v22.4S, v21.4S // .................................................*.......................................................... + // gap // ............................................................................................................ + str q27, [x0], #(16) // .......................................................................................................*.... + // gap // ............................................................................................................ + cmge v24.4S, v31.4S, v6.4S // ...........................................................................................*................ + // gap // ............................................................................................................ + add v7.4S, v5.4S, v13.4S // ................................................*........................................................... + // gap // ............................................................................................................ + sub v27.4S, v5.4S, v13.4S // ...............................................*............................................................ + // gap // ............................................................................................................ + cmge v10.4S, v6.4S, v30.4S // ............................................................................................*............... + // gap // ............................................................................................................ + mul v13.4S, v7.4S, v25.4S // ................................................................................*........................... + // gap // ............................................................................................................ 
+ sqrdmulh v7.4S, v7.4S, v26.4S // ...................................................................................*........................ + // gap // ............................................................................................................ + sub v10.4S, v24.4S, v10.4S // ..............................................................................................*............. + // gap // ............................................................................................................ + sqrdmulh v12.4S, v17.4S, v0.S[1] // .......................................................*.................................................... + // gap // ............................................................................................................ + mul v15.4S, v17.4S, v0.S[0] // ......................................................*..................................................... + // gap // ............................................................................................................ + mls v13.4S, v7.4S, v8.S[0] // .......................................................................................*.................... + // gap // ............................................................................................................ + mls v6.4S, v10.4S, v8.4S // .................................................................................................*.......... + // gap // ............................................................................................................ + sqrdmulh v11.4S, v27.4S, v0.S[1] // ....................................................*....................................................... + // gap // ............................................................................................................ + mls v15.4S, v12.4S, v8.S[0] // ............................................................*............................................... 
+ // gap // ............................................................................................................ + cmge v24.4S, v31.4S, v13.4S // ...............................................................................................*............ + // gap // ............................................................................................................ + cmge v10.4S, v13.4S, v30.4S // ................................................................................................*........... + // gap // ............................................................................................................ + mul v17.4S, v27.4S, v0.S[0] // ...................................................*........................................................ + // gap // ............................................................................................................ + sub v10.4S, v24.4S, v10.4S // ..................................................................................................*......... + // gap // ............................................................................................................ + sub v12.4S, v14.4S, v28.4S // .........................................*.................................................................. + // gap // ............................................................................................................ + cmge v7.4S, v31.4S, v15.4S // ......................................................................*..................................... + // gap // ............................................................................................................ + cmge v24.4S, v15.4S, v30.4S // .......................................................................*.................................... + // gap // ............................................................................................................ 
+ mls v17.4S, v11.4S, v8.S[0] // ........................................................*................................................... + // gap // ............................................................................................................ + sub v24.4S, v7.4S, v24.4S // .........................................................................*.................................. + // gap // ............................................................................................................ + sqrdmulh v28.4S, v12.4S, v0.S[1] // ..............................................*............................................................. + // gap // ............................................................................................................ + mul v27.4S, v12.4S, v0.S[0] // .............................................*.............................................................. + // gap // ............................................................................................................ + cmge v12.4S, v31.4S, v17.4S // ..................................................................*......................................... + // gap // ............................................................................................................ + cmge v7.4S, v17.4S, v30.4S // ...................................................................*........................................ + // gap // ............................................................................................................ + sub v19.4S, v16.4S, v4.4S // .............*.............................................................................................. + // gap // ............................................................................................................ + sub v7.4S, v12.4S, v7.4S // .....................................................................*...................................... 
+ // gap // ............................................................................................................ + mls v27.4S, v28.4S, v8.S[0] // ..................................................*......................................................... + // gap // ............................................................................................................ + sqrdmulh v11.4S, v19.4S, v0.S[1] // ............................*............................................................................... + // gap // ............................................................................................................ + mls v17.4S, v7.4S, v8.4S // ........................................................................*................................... + // gap // ............................................................................................................ + mul v28.4S, v19.4S, v0.S[0] // ...........................*................................................................................ + // gap // ............................................................................................................ + cmge v12.4S, v27.4S, v30.4S // ...............................................................*............................................ + // gap // ............................................................................................................ + cmge v7.4S, v31.4S, v27.4S // ..............................................................*............................................. + // gap // ............................................................................................................ + str q17, [x0, #752] // ...............................................................................*............................ + // gap // ............................................................................................................ 
+ sub v7.4S, v7.4S, v12.4S // .................................................................*.......................................... + // gap // ............................................................................................................ + mls v28.4S, v11.4S, v8.S[0] // ..................................*......................................................................... + // gap // ............................................................................................................ + str q6, [x0, #112] // .........................................................................................................*.. + // gap // ............................................................................................................ + mls v27.4S, v7.4S, v8.4S // ....................................................................*....................................... + // gap // ............................................................................................................ + mls v15.4S, v24.4S, v8.4S // ............................................................................*............................... + // gap // ............................................................................................................ + cmge v6.4S, v31.4S, v28.4S // .........................................................*.................................................. + // gap // ............................................................................................................ + cmge v24.4S, v28.4S, v30.4S // ..........................................................*................................................. + // gap // ............................................................................................................ + str q27, [x0, #624] // .............................................................................*.............................. 
+ // gap // ............................................................................................................ + sub v24.4S, v6.4S, v24.4S // .............................................................*.............................................. + // gap // ............................................................................................................ + mls v13.4S, v10.4S, v8.4S // .....................................................................................................*...... + // gap // ............................................................................................................ + str q15, [x0, #880] // .................................................................................*.......................... + // gap // ............................................................................................................ + mls v28.4S, v24.4S, v8.4S // ................................................................*........................................... + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q13, [x0, #240] // ..........................................................................................................*. + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ 
+ str q28, [x0, #496] // ..........................................................................*................................. + // gap // ............................................................................................................ + + // original source code + // sub v10.4S, v7.4S, v6.4S // *........................................................................................................... + // add v24.4S, v7.4S, v6.4S // ................*........................................................................................... + // sub v7.4S, v12.4S, v21.4S // .....*...................................................................................................... + // mul v13.4S, v10.4S, v1.S[2] // ...........*................................................................................................ + // sqrdmulh v10.4S, v10.4S, v1.S[3] // ...*........................................................................................................ + // sub v6.4S, v24.4S, v19.4S // ......................................*..................................................................... + // add v24.4S, v24.4S, v19.4S // ........................*................................................................................... + // mul v12.4S, v7.4S, v2.S[0] // ..........*................................................................................................. + // sqrdmulh v7.4S, v7.4S, v2.S[1] // .........*.................................................................................................. + // mls v13.4S, v10.4S, v8.S[0] // ...............*............................................................................................ + // sub v10.4S, v17.4S, v28.4S // ..*......................................................................................................... 
+ // mul v17.4S, v6.4S, v0.S[2] // ....................................................*....................................................... + // sqrdmulh v6.4S, v6.4S, v0.S[3] // ..........................................*................................................................. + // sub v28.4S, v24.4S, v4.4S // .....................................................................................*...................... + // add v24.4S, v24.4S, v4.4S // ............................*............................................................................... + // mls v12.4S, v7.4S, v8.S[0] // ..............*............................................................................................. + // mul v7.4S, v10.4S, v2.S[2] // ......*..................................................................................................... + // sqrdmulh v10.4S, v10.4S, v2.S[3] // .......*.................................................................................................... + // sub v27.4S, v27.4S, v15.4S // .*.......................................................................................................... + // sub v15.4S, v13.4S, v12.4S // ...................*........................................................................................ + // add v13.4S, v13.4S, v12.4S // ..........................*................................................................................. + // mls v7.4S, v10.4S, v8.S[0] // .............*.............................................................................................. + // mul v10.4S, v27.4S, v3.S[0] // ....*....................................................................................................... + // mls v17.4S, v6.4S, v8.S[0] // ........................................................*................................................... 
+ // sqrdmulh v6.4S, v27.4S, v3.S[1] // ........*................................................................................................... + // mul v12.4S, v15.4S, v0.S[2] // .......................*.................................................................................... + // sqrdmulh v27.4S, v15.4S, v0.S[3] // ......................*..................................................................................... + // mul v15.4S, v28.4S, v0.S[0] // ..........................................................................................*................. + // sqrdmulh v28.4S, v28.4S, v0.S[1] // ........................................................................................*................... + // mul v21.4S, v24.4S, v25.4S // ....................................*....................................................................... + // sqrdmulh v24.4S, v24.4S, v26.4S // .....................................*...................................................................... + // mls v10.4S, v6.4S, v8.S[0] // ............*............................................................................................... + // mls v12.4S, v27.4S, v8.S[0] // ...........................*................................................................................ + // sub v6.4S, v18.4S, v11.4S // ..............................*............................................................................. + // mls v15.4S, v28.4S, v8.S[0] // ...............................................................................................*............ + // sub v28.4S, v7.4S, v10.4S // .................*.......................................................................................... + // mul v27.4S, v6.4S, v1.S[0] // ...............................................*............................................................ 
+ // sqrdmulh v6.4S, v6.4S, v1.S[1] // ........................................*................................................................... + // add v10.4S, v7.4S, v10.4S // ..................*......................................................................................... + // mul v7.4S, v28.4S, v1.S[0] // .....................*...................................................................................... + // sqrdmulh v28.4S, v28.4S, v1.S[1] // ....................*....................................................................................... + // sub v18.4S, v13.4S, v10.4S // ............................................................................*............................... + // add v10.4S, v13.4S, v10.4S // .............................*.............................................................................. + // mls v27.4S, v6.4S, v8.S[0] // ...................................................*........................................................ + // mls v7.4S, v28.4S, v8.S[0] // .........................*.................................................................................. + // mul v13.4S, v18.4S, v0.S[0] // ..................................................................................*......................... + // sqrdmulh v6.4S, v18.4S, v0.S[1] // .................................................................................*.......................... + // sub v28.4S, v17.4S, v27.4S // .............................................................*.............................................. + // add v17.4S, v17.4S, v27.4S // ............................................................*............................................... + // sub v27.4S, v12.4S, v7.4S // .........................................................*.................................................. 
+ // mls v13.4S, v6.4S, v8.S[0] // .......................................................................................*.................... + // mul v6.4S, v28.4S, v0.S[0] // ..........................................................................*................................. + // sqrdmulh v28.4S, v28.4S, v0.S[1] // ......................................................................*..................................... + // add v7.4S, v12.4S, v7.4S // ...............................*............................................................................ + // mul v12.4S, v27.4S, v0.S[0] // ...................................................................*........................................ + // sqrdmulh v27.4S, v27.4S, v0.S[1] // ..................................................................*......................................... + // mls v6.4S, v28.4S, v8.S[0] // ...............................................................................*............................ + // cmge v28.4S, v31.4S, v15.4S // ...................................................................................................*........ + // cmge v18.4S, v15.4S, v30.4S // ....................................................................................................*....... + // mls v21.4S, v24.4S, v8.S[0] // .........................................*.................................................................. + // mls v12.4S, v27.4S, v8.S[0] // .......................................................................*.................................... + // sub v24.4S, v28.4S, v18.4S // ......................................................................................................*..... + // cmge v28.4S, v31.4S, v13.4S // ............................................................................................*............... 
+ // cmge v27.4S, v13.4S, v30.4S // ...........................................................................................*................ + // mls v15.4S, v24.4S, v8.4S // .........................................................................................................*.. + // sub v24.4S, v28.4S, v27.4S // ..............................................................................................*............. + // cmge v28.4S, v31.4S, v6.4S // ...................................................................................*........................ + // cmge v27.4S, v6.4S, v30.4S // ....................................................................................*....................... + // mls v13.4S, v24.4S, v8.4S // .................................................................................................*.......... + // sub v24.4S, v28.4S, v27.4S // ......................................................................................*..................... + // cmge v28.4S, v31.4S, v12.4S // .............................................................................*.............................. + // cmge v27.4S, v12.4S, v30.4S // ..............................................................................*............................. + // mls v6.4S, v24.4S, v8.4S // .........................................................................................*.................. + // sub v24.4S, v28.4S, v27.4S // ................................................................................*........................... + // str q15, [x0, #512] // ...........................................................................................................* + // mul v28.4S, v10.4S, v25.4S // .................................*.......................................................................... 
+ // mls v12.4S, v24.4S, v8.4S // ..................................................................................................*......... + // str q13, [x0, #640] // .....................................................................................................*...... + // sqrdmulh v10.4S, v10.4S, v26.4S // ................................*........................................................................... + // str q6, [x0, #768] // .............................................................................................*.............. + // mul v24.4S, v17.4S, v25.4S // ...............................................................*............................................ + // str q12, [x0, #896] // ........................................................................................................*... + // mls v28.4S, v10.4S, v8.S[0] // .......................................................*.................................................... + // sqrdmulh v10.4S, v17.4S, v26.4S // ................................................................*........................................... + // mul v13.4S, v7.4S, v25.4S // ...................................*........................................................................ + // sqrdmulh v7.4S, v7.4S, v26.4S // ..................................*......................................................................... + // cmge v6.4S, v31.4S, v21.4S // .............................................*.............................................................. + // mls v24.4S, v10.4S, v8.S[0] // ....................................................................*....................................... + // cmge v10.4S, v21.4S, v30.4S // ................................................*........................................................... 
+ // mls v13.4S, v7.4S, v8.S[0] // .......................................*.................................................................... + // sub v10.4S, v6.4S, v10.4S // ..................................................*......................................................... + // cmge v7.4S, v31.4S, v28.4S // ...........................................................*................................................ + // cmge v6.4S, v28.4S, v30.4S // ..............................................................*............................................. + // mls v21.4S, v10.4S, v8.4S // ......................................................*..................................................... + // sub v10.4S, v7.4S, v6.4S // .................................................................*.......................................... + // cmge v7.4S, v31.4S, v24.4S // ........................................................................*................................... + // cmge v6.4S, v24.4S, v30.4S // .........................................................................*.................................. + // mls v28.4S, v10.4S, v8.4S // .....................................................................*...................................... + // sub v10.4S, v7.4S, v6.4S // ...........................................................................*................................ + // cmge v7.4S, v31.4S, v13.4S // ............................................*............................................................... + // cmge v6.4S, v13.4S, v30.4S // ...........................................*................................................................ + // mls v24.4S, v10.4S, v8.4S // .......................................................................................................*.... 
+ // sub v10.4S, v7.4S, v6.4S // ..............................................*............................................................. + // str q21, [x0], #(16) // ..........................................................*................................................. + // mls v13.4S, v10.4S, v8.4S // .................................................*.......................................................... + // str q28, [x0, #112] // ................................................................................................*........... + // str q24, [x0, #240] // ..........................................................................................................*. + // str q13, [x0, #368] // .....................................................*...................................................... + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s new file mode 100644 index 00000000..e58113ab --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_a72.s @@ -0,0 +1,2629 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10                          // symbolic name for x10
+xtmp1 .req x11                          // symbolic name for x11
+
+.macro ldr_vo vec, base, offset         // qform_<vec> = 128-bit load from [base + offset]; base unchanged
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc            // qform_<vec> = 128-bit load from [base]; then base += inc (post-index)
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset         // store qform_<vec> to [base + offset]; base unchanged
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc            // store qform_<vec> to [base]; then base += inc (post-index)
+        str qform_\vec, [\base], #\inc
+.endm
+.macro vqrdmulh d,a,b                   // d = sqrdmulh(a, b), vector-by-vector on 4 x 32-bit lanes
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmls d,a,b                       // d -= a * b, vector-by-vector on 4 x 32-bit lanes
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulhq d,a,b,i                // d = sqrdmulh(a, lane i of b), by-element form
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vqdmulhq d,a,b,i                 // d = sqdmulh(a, lane i of b), by-element form
+        sqdmulh \d\().4s, \a\().4s, \b\().s[\i]   // element operand spelled .s[i], matching the sibling by-element macros
+.endm
+.macro vmulq d,a,b,i                    // d = a * (lane i of b)
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i                    // d -= a * (lane i of b)
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1   // dst = src*const[idx0] - consts[0]*sqrdmulh(src, const[idx1]); overwrites src
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro mulmod dst, src, const, const_twisted   // dst = src*const - consts[0]*sqrdmulh(src, const_twisted); overwrites src
+        mul \dst\().4s, \src\().4s, \const\().4s
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro barrett_reduce_single a          // tmp = rounding shift right of a by 23; a -= tmp*consts; clobbers tmp
+        srshr tmp.4S, \a\().4S, #23
+        vmls \a, tmp, consts
+.endm
+
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2   // per lane: a += consts if a <= neg_modulus_half, a -= consts if a >= modulus_half
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s   // tmp1 = all-ones where neg_modulus_half >= a
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s       // tmp2 = all-ones where a >= modulus_half
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s             // tmp2 = -1 / 0 / +1 correction factor
+        vmls \a, \tmp2, consts
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1   // tmp = a - b; a = a + b; b = mulmodq(tmp, root[idx0], root[idx1]); clobbers tmp
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted   // as gs_butterfly, but with full-vector root operands (mulmod); clobbers tmp
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3   // dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3; overwrites each srcN
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro load_vectors a0, a1, a2, a3, 
addr   // load a0..a3 from addr + 0, 16, 32, 48; addr unchanged
+        ldr_vo \a0, \addr, (16*0)
+        ldr_vo \a1, \addr, (16*1)
+        ldr_vo \a2, \addr, (16*2)
+        ldr_vo \a3, \addr, (16*3)
+.endm
+
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset   // load a0..a3 from addr + offset + 0, 16, 32, 48; addr unchanged
+        ldr_vo \a0, \addr, (16*0 + (\offset))
+        ldr_vo \a1, \addr, (16*1 + (\offset))
+        ldr_vo \a2, \addr, (16*2 + (\offset))
+        ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc   // store a0..a3 to the 64 bytes at addr, leaving addr advanced by inc
+        str_vi \a0, \addr, \inc                 // post-index store: addr += inc happens here
+        str_vo \a1, \addr, (-(\inc) + 16*1)     // remaining offsets undo the increment already applied
+        str_vo \a2, \addr, (-(\inc) + 16*2)
+        str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+.macro vec_to_scalar_matrix out, in   // copy both 64-bit lanes of <in>0..<in>3 into GPRs <out>_00..<out>_31 (via vext)
+        vext \out\()_00, \in\()0, 0
+        vext \out\()_01, \in\()0, 1
+        vext \out\()_10, \in\()1, 0
+        vext \out\()_11, \in\()1, 1
+        vext \out\()_20, \in\()2, 0
+        vext \out\()_21, \in\()2, 1
+        vext \out\()_30, \in\()3, 0
+        vext \out\()_31, \in\()3, 1
+.endm
+
+.macro store_scalar_matrix_with_inc x, addr, inc   // store GPRs <x>t_00..<x>t_31 to 8 consecutive 8-byte slots at addr, leaving addr advanced by inc
+        str \x\()t_00, [\addr], #( \inc)              // post-index store: addr += inc happens here
+        str \x\()t_01, [\addr, #(-(\inc) + 8*1)]      // \inc is parenthesized so expression arguments negate as a whole
+        str \x\()t_10, [\addr, #(-(\inc) + 8*2)]
+        str \x\()t_11, [\addr, #(-(\inc) + 8*3)]
+        str \x\()t_20, [\addr, #(-(\inc) + 8*4)]
+        str \x\()t_21, [\addr, #(-(\inc) + 8*5)]
+        str \x\()t_30, [\addr, #(-(\inc) + 8*6)]
+        str \x\()t_31, [\addr, #(-(\inc) + 8*7)]
+.endm
+
+.macro vext gpr_out, vec_in, lane   // gpr_out = 64-bit lane <lane> of vec_in
+        umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+.macro load_roots_123   // load root0..root3 from the 64 bytes at r_ptr0; r_ptr0 += 64
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+.macro load_roots_456   // same body as load_roots_123: root0..root3 from r_ptr0; r_ptr0 += 64
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+.macro load_roots_78_part1   // load vectors 0..5 of a 12-vector block at r_ptr1 into root0..2 / root0_tw..2_tw; r_ptr1 += 12*16
+        ldr_vi root0, r_ptr1, (12*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+.macro load_roots_78_part2   // load vectors 6..11 of that block; offsets assume r_ptr1 was already advanced by 12*16 (as part1 does)
+        ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+        ldr_vo 
root0_tw, r_ptr1, (-12*16 + 7*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+.macro transpose4 data0, data1, data2, data3   // in-place 4x4 transpose of the 32-bit lanes of data0..data3; clobbers t0..t3
+        trn1 t0.4s, \data0\().4s, \data1\().4s   // stage 1: interleave 32-bit lanes of row pairs
+        trn2 t1.4s, \data0\().4s, \data1\().4s
+        trn1 t2.4s, \data2\().4s, \data3\().4s
+        trn2 t3.4s, \data2\().4s, \data3\().4s
+
+        trn2 \data2\().2d, t0.2d, t2.2d          // stage 2: interleave 64-bit halves
+        trn2 \data3\().2d, t1.2d, t3.2d
+        trn1 \data0\().2d, t0.2d, t2.2d
+        trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3   // 32-bit trn1/trn2 stage only: out0/out1 from in0,in1 and out2/out3 from in2,in3
+        trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+        trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+        trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+        trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+.macro save_gprs // @slothy:no-unfold
+        sub sp, sp, #(16*6)             // reserve six 16-byte slots, one per register pair x19..x30
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        stp x29, x30, [sp, #16*5]
+.endm
+
+.macro restore_gprs // @slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]       // reload the pairs written by save_gprs
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldp x29, x30, [sp, #16*5]
+        add sp, sp, #(16*6)             // release the six slots
+.endm
+
+.macro save_vregs // @slothy:no-unfold
+        sub sp, sp, #(16*4)             // reserve four 16-byte slots for d8..d15
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // @slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]         // reload the pairs written by save_vregs
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)             // release the four slots
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // @slothy:no-unfold
+        ldr \a, [sp, #\loc\()]          // a = value at [sp + loc]
+.endm
+.macro save loc, a // @slothy:no-unfold
+        str \a, [sp, #\loc\()]          // [sp + loc] = a; note the argument order is (loc, a), the reverse of restore
+.endm
+.macro push_stack 
// @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +// Mirror of push_stack: drop the STACK_SIZE scratch area, then run restore_vregs and restore_gprs. +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +// 16-byte aligned table labelled roots; its contents come from the included twiddles file. +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4_opt_a72 + .global _intt_dilithium_123_45678_manual_ld4_opt_a72 + +// Constant pool: const_addr holds 8380417 (= 2^23 - 2^13 + 1) followed by three zero words, then two 64-bit constants. +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +// Entry point, exported both with and without a leading underscore. +intt_dilithium_123_45678_manual_ld4_opt_a72: +_intt_dilithium_123_45678_manual_ld4_opt_a72: + push_stack + + // Symbolic names for general-purpose registers x0-x6. + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + // data0-data7 name v9-v16; the qform_data* aliases after them name the same registers through their q views. + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + // qform_vN maps to qN for every N in 0..31. + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + // x_ab names x10-x17; the xt_ab aliases after them refer to the same eight registers in a permuted order. + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1
.req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + // Set up r_ptr0/r_ptr1 from roots_l345/roots_l67. ASM_LOAD is defined outside this view; presumably it loads the label's address - confirm. + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + // Replicate the 32-bit word at const_addr into all four lanes of consts. + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + // Keep a copy of the input pointer in stack slot STACK0. + save STACK0, in + + // inp is copied straight from in, so no reload from STACK0 is needed here; inpp points 64 bytes past inp and count starts at 8. + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + // Instructions: 30 + // Expected cycles: 23 + // Expected IPC: 1.30 + // + // Wall time: 0.38s + // User time: 0.38s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q23, [x1, #0] // .....*........................ + ldr q2, [x1, #16] // ......*....................... + // gap // .............................. + ldr q11, [x1, #32] // .......*...................... + ldr q27, [x1, #48] // ........*..................... + // gap // .............................. + ldr q25, [x5, #32] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q12, [x5, #48] // ..*........................... + // gap // .............................. + // gap // .............................. + trn2 v10.4S, v23.4S, v2.4S // ..........*................... + trn1 v23.4S, v23.4S, v2.4S // .........*.................... + ldr q30, [x5, #16] // ....*......................... + trn2 v18.4S, v11.4S, v27.4S // ............*................. + trn1 v11.4S, v11.4S, v27.4S // ...........*.................. + ldr q0, [x2, #48] // ..........................*... + ldr q17, [x2, #16] // ............................*. + // gap // ..............................
+ // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q13, [x5, #80] // *............................. + trn2 v28.2D, v10.2D, v18.2D // ..............*............... + trn2 v7.2D, v23.2D, v11.2D // .............*................ + // gap // .............................. + trn1 v11.2D, v23.2D, v11.2D // ...............*.............. + // gap // .............................. + // gap // .............................. + trn1 v27.2D, v10.2D, v18.2D // ................*............. + ldr q18, [x2, #32] // .........................*.... + // gap // .............................. + // gap // .............................. + // gap // .............................. + sub v10.4S, v7.4S, v28.4S // .................*............ + add v24.4S, v7.4S, v28.4S // ..................*........... + // gap // .............................. + // gap // .............................. + add v1.4S, v11.4S, v27.4S // .....................*........ + sub v20.4S, v11.4S, v27.4S // ...................*.......... + // gap // .............................. + sqrdmulh v27.4S, v10.4S, v13.4S // ....................*......... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q7, [x5, #64] // ...*.......................... + mul v25.4S, v20.4S, v25.4S // ......................*....... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v9.4S, v20.4S, v12.4S // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ mul v14.4S, v10.4S, v7.4S // ...........................*.. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v14.4S, v27.4S, v8.S[0] // .............................* + // gap // .............................. + sub v16.4S, v1.4S, v24.4S // .......................*...... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q7, [x5, #80] // .............*................. + // ldr q21, [x5, #32] // ....*.......................... + // ldr q14, [x5, #48] // .....*......................... + // ldr q29, [x5, #64] // ........................*...... + // ldr q30, [x5, #16] // ........*...................... + // ldr q25, [x1, #0] // *.............................. + // ldr q1, [x1, #16] // .*............................. + // ldr q20, [x1, #32] // ..*............................ + // ldr q15, [x1, #48] // ...*........................... + // trn1 v26.4S, v25.4S, v1.4S // .......*....................... + // trn2 v25.4S, v25.4S, v1.4S // ......*........................ + // trn1 v1.4S, v20.4S, v15.4S // ..........*.................... + // trn2 v20.4S, v20.4S, v15.4S // .........*..................... + // trn2 v27.2D, v26.2D, v1.2D // ...............*............... + // trn2 v23.2D, v25.2D, v20.2D // ..............*................ + // trn1 v1.2D, v26.2D, v1.2D // ................*.............. + // trn1 v11.2D, v25.2D, v20.2D // .................*............. + // sub v13.4S, v27.4S, v23.4S // ...................*........... + // add v24.4S, v27.4S, v23.4S // ....................*.......... + // sub v27.4S, v1.4S, v11.4S // ......................*........ + // sqrdmulh v7.4S, v13.4S, v7.4S // .......................*....... + // add v1.4S, v1.4S, v11.4S // .....................*......... + // mul v25.4S, v27.4S, v21.4S // .........................*..... 
+ // sub v16.4S, v1.4S, v24.4S // .............................*. + // sqrdmulh v9.4S, v27.4S, v14.4S // ..........................*.... + // ldr q18, [x2, #32] // ..................*............ + // ldr q0, [x2, #48] // ...........*................... + // mul v14.4S, v13.4S, v29.4S // ...........................*... + // ldr q17, [x2, #16] // ............*.................. + // mls v14.4S, v7.4S, v8.S[0] // ............................*.. + + sub count, count, #1 +layer45678_start: + // Instructions: 174 + // Expected cycles: 129 + // Expected IPC: 1.35 + // + // Wall time: 2804.18s + // User time: 2804.18s + // + // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + ldr q27, [x2, #0] // ............*................................................................................................................................................................. + trn1 v11.4S, v18.4S, v0.4S // ..................*........................................................................................................................................................... + ldr q7, [x5, #272] // .............................e................................................................................................................................................ + ldr q23, [x5], #(12*16) // ........................*..................................................................................................................................................... 
+ mls v25.4S, v9.4S, v8.S[0] // ..................................*........................................................................................................................................... + add v24.4S, v1.4S, v24.4S // .........................................*.................................................................................................................................... + ldr q9, [x5, #-96] // ..................................................*........................................................................................................................... + ldr q13, [x5, #-80] // ...................................................*.......................................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v1.4S, v16.4S, v30.4S // ...........................................*.................................................................................................................................. + ldr q3, [x5, #-64] // ....................................................*......................................................................................................................... + ldr q20, [x5, #-48] // .....................................................*........................................................................................................................ + trn1 v21.4S, v27.4S, v17.4S // ................*............................................................................................................................................................. 
+ ldr q15, [x5, #-32] // ......................................................*....................................................................................................................... + ldr q26, [x5, #-16] // .......................................................*...................................................................................................................... + trn2 v27.4S, v27.4S, v17.4S // .................*............................................................................................................................................................ + trn2 v18.4S, v18.4S, v0.4S // ...................*.......................................................................................................................................................... + ldr q0, [x4], #64 // ............................................................................................*................................................................................. + mul v16.4S, v16.4S, v23.4S // ..........................................*................................................................................................................................... + sub v17.4S, v25.4S, v14.4S // .............................................*................................................................................................................................ + ldr q2, [x4, #-48] // .............................................................................................*................................................................................ + ldr q4, [x4, #-32] // ..............................................................................................*............................................................................... 
+ trn2 v5.2D, v21.2D, v11.2D // ....................*......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v16.4S, v1.4S, v8.S[0] // ............................................*................................................................................................................................. + trn2 v1.2D, v27.2D, v18.2D // .....................*........................................................................................................................................................ + ldr q22, [x4, #-16] // ...............................................................................................*.............................................................................. + trn1 v11.2D, v21.2D, v11.2D // ......................*....................................................................................................................................................... + ldr q21, [x5, #32] // ..........................e................................................................................................................................................... + // gap // .............................................................................................................................................................................. + mul v23.4S, v17.4S, v23.4S // ...............................................*.............................................................................................................................. 
+ add v25.4S, v25.4S, v14.4S // ..............................................*............................................................................................................................... + ldr q14, [x5, #48] // ...........................e.................................................................................................................................................. + sub v19.4S, v5.4S, v1.4S // .............................................................*................................................................................................................ + ldr q29, [x5, #64] // ............................e................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v27.2D, v27.2D, v18.2D // .......................*...................................................................................................................................................... + sqrdmulh v18.4S, v17.4S, v30.4S // ................................................*............................................................................................................................. + ldr q30, [x5, #16] // .........................e.................................................................................................................................................... + trn1 v17.4S, v24.4S, v25.4S // ............................................................................*................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v25.4S, v24.4S, v25.4S // .............................................................................*................................................................................................ + mul v24.4S, v19.4S, v15.4S // ...............................................................*.............................................................................................................. + // gap // .............................................................................................................................................................................. + sub v15.4S, v11.4S, v27.4S // ........................................................*..................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v26.4S, v19.4S, v26.4S // ................................................................*............................................................................................................. + add v27.4S, v11.4S, v27.4S // .........................................................*.................................................................................................................... 
+ // gap // .............................................................................................................................................................................. + add v11.4S, v5.4S, v1.4S // ..............................................................*............................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v23.4S, v18.4S, v8.S[0] // .................................................*............................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v1.4S, v15.4S, v3.4S // ..........................................................*................................................................................................................... + sub v3.4S, v27.4S, v11.4S // ..................................................................*........................................................................................................... + // gap // .............................................................................................................................................................................. + add v27.4S, v27.4S, v11.4S // ...................................................................*.......................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v11.4S, v15.4S, v20.4S // ...........................................................*.................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v20.4S, v16.4S, v23.4S // ..............................................................................*............................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v23.4S, v16.4S, v23.4S // ...............................................................................*.............................................................................................. + mls v24.4S, v26.4S, v8.S[0] // .................................................................*............................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v15.4S, v3.4S, v13.4S // .....................................................................*........................................................................................................ + trn2 v26.2D, v17.2D, v20.2D // ................................................................................*............................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn1 v20.2D, v17.2D, v20.2D // ..................................................................................*........................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v1.4S, v11.4S, v8.S[0] // ............................................................*................................................................................................................. + trn1 v11.2D, v25.2D, v23.2D // ...................................................................................*.......................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v23.2D, v25.2D, v23.2D // .................................................................................*............................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v25.4S, v3.4S, v9.4S // ....................................................................*......................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v3.4S, v20.4S, v11.4S // ................................................................................................*............................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v25.4S, v15.4S, v8.S[0] // ......................................................................*....................................................................................................... + add v11.4S, v20.4S, v11.4S // .................................................................................................*............................................................................ + // gap // .............................................................................................................................................................................. + sub v20.4S, v1.4S, v24.4S // .......................................................................*...................................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v24.4S, v1.4S, v24.4S // ........................................................................*..................................................................................................... + mul v1.4S, v3.4S, v2.S[2] // ..................................................................................................*........................................................................... + // gap // .............................................................................................................................................................................. + sub v15.4S, v26.4S, v23.4S // .....................................................................................................*........................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v9.4S, v20.4S, v9.4S // .........................................................................*.................................................................................................... + add v23.4S, v26.4S, v23.4S // ......................................................................................................*....................................................................... 
+ // gap // .............................................................................................................................................................................. + trn1 v26.4S, v27.4S, v24.4S // ....................................................................................*......................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v20.4S, v13.4S // ..........................................................................*................................................................................................... + trn2 v27.4S, v27.4S, v24.4S // .....................................................................................*........................................................................................ + // gap // .............................................................................................................................................................................. + sub v24.4S, v11.4S, v23.4S // ....................................................................................................................*......................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v3.4S, v3.4S, v2.S[3] // ...................................................................................................*.......................................................................... + add v11.4S, v11.4S, v23.4S // .....................................................................................................................*........................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v23.4S, v15.4S, v4.S[0] // .......................................................................................................*...................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v20.4S, v11.4S, #23 // ........................................................................................................................................*..................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v13.4S, v8.S[0] // ...........................................................................*.................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v1.4S, v3.4S, v8.S[0] // ....................................................................................................*......................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v15.4S, v4.S[1] // ........................................................................................................*..................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v3.4S, v25.4S, v9.4S // ......................................................................................*....................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ trn2 v25.4S, v25.4S, v9.4S // .......................................................................................*...................................................................................... + mul v9.4S, v24.4S, v0.S[2] // ......................................................................................................................*....................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v24.4S, v24.4S, v0.S[3] // .......................................................................................................................*...................................................... + trn2 v15.2D, v26.2D, v3.2D // ........................................................................................*..................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v3.2D, v26.2D, v3.2D // ..........................................................................................*................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v23.4S, v13.4S, v8.S[0] // .........................................................................................................*.................................................................... + trn1 v13.2D, v27.2D, v25.2D // ...........................................................................................*.................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v27.2D, v27.2D, v25.2D // .........................................................................................*.................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v11.4S, v20.4S, v8.4S // .........................................................................................................................................*.................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sub v25.4S, v3.4S, v13.4S // ..........................................................................................................*................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v24.4S, v8.S[0] // ........................................................................................................................*..................................................... + add v24.4S, v3.4S, v13.4S // ...........................................................................................................*.................................................................. + // gap // .............................................................................................................................................................................. + sub v13.4S, v15.4S, v27.4S // ...............................................................................................................*.............................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ add v27.4S, v15.4S, v27.4S // ................................................................................................................*............................................................. + mul v3.4S, v25.4S, v4.S[2] // ............................................................................................................*................................................................. + // gap // .............................................................................................................................................................................. + sub v20.4S, v1.4S, v23.4S // .........................................................................................................................*.................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v25.4S, v25.4S, v4.S[3] // .............................................................................................................*................................................................ + add v23.4S, v1.4S, v23.4S // ..........................................................................................................................*................................................... + // gap // .............................................................................................................................................................................. + sub v1.4S, v24.4S, v27.4S // ..............................................................................................................................*............................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v27.4S, v24.4S, v27.4S // ...............................................................................................................................*.............................................. + sqrdmulh v24.4S, v13.4S, v22.S[1] // ..................................................................................................................*........................................................... + // gap // .............................................................................................................................................................................. + srshr v15.4S, v23.4S, #23 // ..........................................................................................................................................*................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v13.4S, v13.4S, v22.S[0] // .................................................................................................................*............................................................ + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + srshr v26.4S, v27.4S, #23 // ............................................................................................................................................*................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v3.4S, v25.4S, v8.S[0] // ..............................................................................................................*............................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v13.4S, v24.4S, v8.S[0] // ...................................................................................................................*.......................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v25.4S, v20.4S, v0.S[2] // ...........................................................................................................................*.................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v24.4S, v1.4S, v2.S[0] // ................................................................................................................................*............................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v18.4S, v3.4S, v13.4S // ...................................................................................................................................*.......................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v1.4S, v1.4S, v2.S[1] // .................................................................................................................................*............................................ + add v13.4S, v3.4S, v13.4S // ....................................................................................................................................*......................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v3.4S, v18.4S, v2.S[0] // .....................................................................................................................................*........................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v16.4S, v13.4S, #23 // ..............................................................................................................................................*............................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v18.4S, v18.4S, v2.S[1] // ......................................................................................................................................*....................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v20.4S, v20.4S, v0.S[3] // ............................................................................................................................*................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v24.4S, v1.4S, v8.S[0] // ..................................................................................................................................*........................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v3.4S, v18.4S, v8.S[0] // .......................................................................................................................................*...................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v25.4S, v20.4S, v8.S[0] // .............................................................................................................................*................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v1.4S, v9.4S, v24.4S // ..........................................................................................................................................................*................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ add v24.4S, v9.4S, v24.4S // ...........................................................................................................................................................*.................. + mls v23.4S, v15.4S, v8.4S // ...........................................................................................................................................*.................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v27.4S, v26.4S, v8.4S // .............................................................................................................................................*................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q24, [x1, #32] // ......................................................................................................................................................................*....... 
+ sub v24.4S, v25.4S, v3.4S // ...............................................................................................................................................................*.............. + // gap // .............................................................................................................................................................................. + mls v13.4S, v16.4S, v8.4S // ...............................................................................................................................................*.............................. + add v25.4S, v25.4S, v3.4S // ................................................................................................................................................................*............. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v9.4S, v1.4S, v0.S[0] // ............................................................................................................................................................*................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + str q25, [x1, #48] // .......................................................................................................................................................................*...... + sub v25.4S, v11.4S, v27.4S // ................................................................................................................................................*............................. + // gap // .............................................................................................................................................................................. + add v27.4S, v11.4S, v27.4S // .................................................................................................................................................*............................ + sqrdmulh v11.4S, v1.4S, v0.S[1] // .............................................................................................................................................................*................ + // gap // .............................................................................................................................................................................. + add v1.4S, v23.4S, v13.4S // ......................................................................................................................................................*....................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sub v23.4S, v23.4S, v13.4S // .....................................................................................................................................................*........................ + mul v13.4S, v25.4S, v0.S[0] // ..................................................................................................................................................*........................... + // gap // .............................................................................................................................................................................. + str q27, [x1], #(16*4) // ....................................................................................................................................................................*......... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v27.4S, v25.4S, v0.S[1] // ...................................................................................................................................................*.......................... + str q1, [x1, #-48] // .....................................................................................................................................................................*........ + add x1, x1, #64 // ............................................................................................................................................................................*. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q25, [x1, #0] // e............................................................................................................................................................................. + ldr q1, [x1, #16] // .e............................................................................................................................................................................ + mul v3.4S, v23.4S, v0.S[0] // .......................................................................................................................................................*...................... + ldr q20, [x1, #32] // ..e........................................................................................................................................................................... + ldr q15, [x1, #48] // ...e.......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v23.4S, v23.4S, v0.S[1] // ........................................................................................................................................................*..................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v26.4S, v25.4S, v1.4S // ....e......................................................................................................................................................................... + mul v18.4S, v24.4S, v0.S[0] // .................................................................................................................................................................*............ + // gap // .............................................................................................................................................................................. + trn2 v25.4S, v25.4S, v1.4S // .....e........................................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v24.4S, v24.4S, v0.S[1] // ..................................................................................................................................................................*........... + trn1 v1.4S, v20.4S, v15.4S // ......e....................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn2 v20.4S, v20.4S, v15.4S // .......e...................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v13.4S, v27.4S, v8.S[0] // ....................................................................................................................................................*......................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v27.2D, v26.2D, v1.2D // ........e..................................................................................................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v3.4S, v23.4S, v8.S[0] // .........................................................................................................................................................*.................... + trn2 v23.2D, v25.2D, v20.2D // .........e.................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + trn1 v1.2D, v26.2D, v1.2D // ..........e................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v11.4S, v8.S[0] // ..............................................................................................................................................................*............... + trn1 v11.2D, v25.2D, v20.2D // ...........e.................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + str q13, [x2], #(16*4) // ........................................................................................................................................................................*..... + sub v13.4S, v27.4S, v23.4S // ...................................e.......................................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v18.4S, v24.4S, v8.S[0] // ...................................................................................................................................................................*.......... + add v24.4S, v27.4S, v23.4S // ....................................e......................................................................................................................................... + // gap // .............................................................................................................................................................................. + str q3, [x2, #-48] // .........................................................................................................................................................................*.... + sub v27.4S, v1.4S, v11.4S // ..............................e............................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v7.4S, v13.4S, v7.4S // ......................................e....................................................................................................................................... + add v1.4S, v1.4S, v11.4S // ...............................e.............................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q9, [x2, #-32] // ..........................................................................................................................................................................*... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v25.4S, v27.4S, v21.4S // ................................e............................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q18, [x2, #-16] // ...........................................................................................................................................................................*.. 
+ sub v16.4S, v1.4S, v24.4S // ........................................e..................................................................................................................................... + add x2, x2, #64 // .............................................................................................................................................................................* + sqrdmulh v9.4S, v27.4S, v14.4S // .................................e............................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q18, [x2, #32] // ..............e............................................................................................................................................................... + ldr q0, [x2, #48] // ...............e.............................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v14.4S, v13.4S, v29.4S // .....................................e........................................................................................................................................ + ldr q17, [x2, #16] // .............e................................................................................................................................................................ 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v14.4S, v7.4S, v8.S[0] // .......................................e...................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------ new position -------------------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 325 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------------- + // ldr q9, [x1, #0] // ......................................................................................................................................e.....................................'.......................................................................................................................................~............................... + // ldr q10, [x1, #16] // .......................................................................................................................................e....................................'........................................................................................................................................~.............................. + // ldr q11, [x1, #32] // .........................................................................................................................................e..................................'..........................................................................................................................................~............................ 
+ // ldr q12, [x1, #48] // ..........................................................................................................................................e.................................'...........................................................................................................................................~........................... + // trn1 v25.4s, v9.4s, v10.4s // ............................................................................................................................................e...............................'.............................................................................................................................................~......................... + // trn2 v26.4s, v9.4s, v10.4s // ..............................................................................................................................................e.............................'...............................................................................................................................................~....................... + // trn1 v27.4s, v11.4s, v12.4s // ................................................................................................................................................e...........................'.................................................................................................................................................~..................... + // trn2 v28.4s, v11.4s, v12.4s // .................................................................................................................................................e..........................'..................................................................................................................................................~.................... 
+ // trn2 v11.2d, v25.2d, v27.2d // ...................................................................................................................................................e........................'....................................................................................................................................................~.................. + // trn2 v12.2d, v26.2d, v28.2d // .....................................................................................................................................................e......................'......................................................................................................................................................~................ + // trn1 v9.2d, v25.2d, v27.2d // ......................................................................................................................................................e.....................'.......................................................................................................................................................~............... + // trn1 v10.2d, v26.2d, v28.2d // ........................................................................................................................................................e...................'.........................................................................................................................................................~............. + // ldr q13, [x2, #0] // ............................................................................................................................................................................*....................................................................................................................................................................... 
+ // ldr q14, [x2, #16] // ..........................................................................................................................................................................e.'....................................................................................................................................................................... + // ldr q15, [x2, #32] // .......................................................................................................................................................................e....'....................................................................................................................................................................... + // ldr q16, [x2, #48] // ........................................................................................................................................................................e...'....................................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // .........~..................................................................................................................................................................'..........*............................................................................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ............~...............................................................................................................................................................'.............*......................................................................................................................................................... 
+ // trn1 v27.4s, v15.4s, v16.4s // ............................................................................................................................................................................'*...................................................................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // .............~..............................................................................................................................................................'..............*........................................................................................................................................................ + // trn2 v15.2d, v25.2d, v27.2d // ...................~........................................................................................................................................................'....................*.................................................................................................................................................. + // trn2 v16.2d, v26.2d, v28.2d // .....................~......................................................................................................................................................'......................*................................................................................................................................................ + // trn1 v13.2d, v25.2d, v27.2d // .......................~....................................................................................................................................................'........................*.............................................................................................................................................. 
+ // trn1 v14.2d, v26.2d, v28.2d // ..............................~.............................................................................................................................................'...............................*....................................................................................................................................... + // ldr q0, [x5], #(12*16) // .~..........................................................................................................................................................................'..*.................................................................................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ................................e...........................................................................................................................................'.................................~..................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 2*16)] // ........................e...................................................................................................................................................'.........................~............................................................................................................................................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ...........................e................................................................................................................................................'............................~.......................................................................................................................................... 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // .............................e..............................................................................................................................................'..............................~........................................................................................................................................ + // ldr q6, [x5, #(-12*16 + 5*16)] // e...........................................................................................................................................................................'.~..................................................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ..............................................................................................................................................................e.............'...............................................................................................................................................................~....... + // add v9.4s, v9.4s, v10.4s // ................................................................................................................................................................e...........'.................................................................................................................................................................~..... + // mul v10.4s, v24.4s, v1.4s // ..................................................................................................................................................................e.........'...................................................................................................................................................................~... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // ......................................................................................................................................................................e.....'....................................................................................................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..~.........................................................................................................................................................................'...*................................................................................................................................................................... + // sub v24.4s, v11.4s, v12.4s // ..........................................................................................................................................................e.................'...........................................................................................................................................................~........... + // add v11.4s, v11.4s, v12.4s // ............................................................................................................................................................e...............'.............................................................................................................................................................~......... + // mul v12.4s, v24.4s, v2.4s // .........................................................................................................................................................................e..'....................................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ...............................................................................................................................................................e............'................................................................................................................................................................~...... + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................................e'....................................................................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ....................................................................................................................................................................e.......'.....................................................................................................................................................................~. + // add v9.4s, v9.4s, v11.4s // ...~........................................................................................................................................................................'....*.................................................................................................................................................................. + // mul v11.4s, v24.4s, v0.4s // ...............~............................................................................................................................................................'................*...................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ......~.....................................................................................................................................................................'.......*............................................................................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ....................~.......................................................................................................................................................'.....................*................................................................................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................~...........................................................................................................................................................'.................*..................................................................................................................................................... + // add v10.4s, v10.4s, v12.4s // ..........................~.................................................................................................................................................'...........................*........................................................................................................................................... + // mul v12.4s, v24.4s, v0.4s // .........................~..................................................................................................................................................'..........................*............................................................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................~............................................................................................................................................'................................*...................................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ........................................~...................................................................................................................................'.........................................*............................................................................................................................. + // ldr q0, [x5, #(-12*16 + 6*16)] // ....~.......................................................................................................................................................................'.....*................................................................................................................................................................. + // ldr q4, [x5, #(-12*16 + 7*16)] // .....~......................................................................................................................................................................'......*................................................................................................................................................................ + // ldr q1, [x5, #(-12*16 + 8*16)] // .......~....................................................................................................................................................................'........*.............................................................................................................................................................. 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // ........~...................................................................................................................................................................'.........*............................................................................................................................................................. + // ldr q2, [x5, #(-12*16 + 10*16)] // ..........~.................................................................................................................................................................'...........*........................................................................................................................................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...........~................................................................................................................................................................'............*.......................................................................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ....................................~.......................................................................................................................................'.....................................*................................................................................................................................. + // add v13.4s, v13.4s, v14.4s // ......................................~.....................................................................................................................................'.......................................*............................................................................................................................... 
+ // mul v14.4s, v24.4s, v1.4s // .........................................~..................................................................................................................................'..........................................*............................................................................................................................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ............................................~...............................................................................................................................'.............................................*......................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ...................................................~........................................................................................................................'....................................................*.................................................................................................................. + // sub v24.4s, v15.4s, v16.4s // ............................~...............................................................................................................................................'.............................*......................................................................................................................................... + // add v15.4s, v15.4s, v16.4s // .......................................~....................................................................................................................................'........................................*.............................................................................................................................. 
+ // mul v16.4s, v24.4s, v2.4s // ...................................~........................................................................................................................................'....................................*.................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // .....................................~......................................................................................................................................'......................................*................................................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ...............................................~............................................................................................................................'................................................*...................................................................................................................... + // sub v24.4s, v13.4s, v15.4s // ..........................................~.................................................................................................................................'...........................................*........................................................................................................................... + // add v13.4s, v13.4s, v15.4s // ...........................................~................................................................................................................................'............................................*.......................................................................................................................... 
+ // mul v15.4s, v24.4s, v0.4s // ......................................................~.....................................................................................................................'.......................................................*............................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ................................................~...........................................................................................................................'.................................................*..................................................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ........................................................~...................................................................................................................'.........................................................*............................................................................................................. + // sub v24.4s, v14.4s, v16.4s // ..........................................................~.................................................................................................................'...........................................................*........................................................................................................... + // add v14.4s, v14.4s, v16.4s // ...........................................................~................................................................................................................'............................................................*.......................................................................................................... 
+ // mul v16.4s, v24.4s, v0.4s // ..............................................................~.............................................................................................................'...............................................................*....................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .................................................................~..........................................................................................................'..................................................................*.................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ........................................................................~...................................................................................................'.........................................................................*............................................................................................. + // trn1 v25.4s, v9.4s, v10.4s // .................................~..........................................................................................................................................'..................................*.................................................................................................................................... + // trn2 v26.4s, v9.4s, v10.4s // ..................................~.........................................................................................................................................'...................................*................................................................................................................................... 
+ // trn1 v27.4s, v11.4s, v12.4s // .............................................~..............................................................................................................................'..............................................*........................................................................................................................ + // trn2 v28.4s, v11.4s, v12.4s // ..............................................~.............................................................................................................................'...............................................*....................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // .................................................~..........................................................................................................................'..................................................*.................................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // .....................................................~......................................................................................................................'......................................................*................................................................................................................ + // trn1 v9.2d, v25.2d, v27.2d // ..................................................~.........................................................................................................................'...................................................*................................................................................................................... 
+ // trn1 v10.2d, v26.2d, v28.2d // ....................................................~.......................................................................................................................'.....................................................*................................................................................................................. + // trn1 v25.4s, v13.4s, v14.4s // ................................................................~...........................................................................................................'.................................................................*..................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..................................................................~.........................................................................................................'...................................................................*................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ...........................................................................~................................................................................................'............................................................................*.......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ............................................................................~...............................................................................................'.............................................................................*......................................................................................... 
+ // trn2 v15.2d, v25.2d, v27.2d // ...............................................................................~............................................................................................'................................................................................*...................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ...................................................................................~........................................................................................'....................................................................................*.................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ................................................................................~...........................................................................................'.................................................................................*..................................................................................... + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................~.........................................................................................'...................................................................................*................................................................................... + // ldr q0, [x4], #64 // ..............~.............................................................................................................................................................'...............*....................................................................................................................................................... 
+ // ldr q1, [x4, #(-64 + 16)] // .................~..........................................................................................................................................................'..................*.................................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // ..................~.........................................................................................................................................................'...................*................................................................................................................................................... + // ldr q3, [x4, #(-64 + 48)] // ......................~.....................................................................................................................................................'.......................*............................................................................................................................................... + // sub v24.4s, v9.4s, v10.4s // .......................................................~....................................................................................................................'........................................................*.............................................................................................................. + // add v9.4s, v9.4s, v10.4s // .........................................................~..................................................................................................................'..........................................................*............................................................................................................ 
+ // mul v10.4s, v24.4s, v1.s[2] // ............................................................~...............................................................................................................'.............................................................*......................................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................................~.......................................................................................................'.....................................................................*................................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // .........................................................................~..................................................................................................'..........................................................................*............................................................................................ + // sub v24.4s, v11.4s, v12.4s // .............................................................~..............................................................................................................'..............................................................*........................................................................................................ + // add v11.4s, v11.4s, v12.4s // ...............................................................~............................................................................................................'................................................................*...................................................................................................... 
+ // mul v12.4s, v24.4s, v2.s[0] // ......................................................................~.....................................................................................................'.......................................................................*............................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................~.................................................................................................'...........................................................................*........................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................................~..........................................................................................'..................................................................................*.................................................................................... + // sub v24.4s, v13.4s, v14.4s // .....................................................................................~......................................................................................'......................................................................................*................................................................................ + // add v13.4s, v13.4s, v14.4s // .......................................................................................~....................................................................................'........................................................................................*.............................................................................. 
+ // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................~.................................................................................'...........................................................................................*........................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ............................................................................................~...............................................................................'.............................................................................................*......................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................~.......................................................................'.....................................................................................................*................................................................. + // sub v24.4s, v15.4s, v16.4s // ........................................................................................~...................................................................................'.........................................................................................*............................................................................. + // add v15.4s, v15.4s, v16.4s // .........................................................................................~..................................................................................'..........................................................................................*............................................................................ 
+ // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................~.........................................................................'...................................................................................................*................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................................................................................................~...........................................................................'.................................................................................................*..................................................................... + // mls v16.4s, v24.4s, v8.s[0] // .....................................................................................................~......................................................................'......................................................................................................*................................................................ + // sub v24.4s, v9.4s, v11.4s // ...................................................................~........................................................................................................'....................................................................*.................................................................................................. + // add v9.4s, v9.4s, v11.4s // .....................................................................~......................................................................................................'......................................................................*................................................................................................ 
+ // mul v11.4s, v24.4s, v0.s[2] // .............................................................................~..............................................................................................'..............................................................................*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................~.............................................................................................'...............................................................................*....................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ......................................................................................~.....................................................................................'.......................................................................................*............................................................................... + // sub v24.4s, v10.4s, v12.4s // ...........................................................................................~................................................................................'............................................................................................*.......................................................................... + // add v10.4s, v10.4s, v12.4s // .............................................................................................~..............................................................................'..............................................................................................*........................................................................ 
+ // mul v12.4s, v24.4s, v0.s[2] // ......................................................................................................~.....................................................................'.......................................................................................................*............................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................................~.............................................................'...............................................................................................................*....................................................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................~..........................................................'..................................................................................................................*.................................................... + // sub v24.4s, v13.4s, v15.4s // ..............................................................................................~.............................................................................'...............................................................................................*....................................................................... + // add v13.4s, v13.4s, v15.4s // ...............................................................................................~............................................................................'................................................................................................*...................................................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // .......................................................................................................~....................................................................'........................................................................................................*.............................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................~..................................................................'..........................................................................................................*............................................................ + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................~............................................................'................................................................................................................*...................................................... + // sub v24.4s, v14.4s, v16.4s // ........................................................................................................~...................................................................'.........................................................................................................*............................................................. + // add v14.4s, v14.4s, v16.4s // ..........................................................................................................~.................................................................'...........................................................................................................*........................................................... 
+ // mul v16.4s, v24.4s, v1.s[0] // ...........................................................................................................~................................................................'............................................................................................................*.......................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................................................................................................~..............................................................'..............................................................................................................*........................................................ + // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................................~...........................................................'.................................................................................................................*..................................................... + // srshr v24.4S, v9.4S, #23 // .......................................................................~....................................................................................................'........................................................................*.............................................................................................. + // mls v9.4s, v24.4s, v8.4s // ....................................................................................~.......................................................................................'.....................................................................................*................................................................................. 
+ // srshr v24.4S, v10.4S, #23 // .................................................................................................~..........................................................................'..................................................................................................*.................................................................... + // mls v10.4s, v24.4s, v8.4s // ....................................................................................................................~.......................................................'.....................................................................................................................*................................................. + // srshr v24.4S, v13.4S, #23 // ...................................................................................................~........................................................................'....................................................................................................*.................................................................. + // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................~......................................................'......................................................................................................................*................................................ + // srshr v24.4S, v14.4S, #23 // ............................................................................................................~...............................................................'.............................................................................................................*......................................................... 
+ // mls v14.4s, v24.4s, v8.4s // ........................................................................................................................~...................................................'.........................................................................................................................*............................................. + // sub v24.4s, v9.4s, v13.4s // ............................................................................................................................~...............................................'.............................................................................................................................*......................................... + // add v9.4s, v9.4s, v13.4s // .............................................................................................................................~..............................................'..............................................................................................................................*........................................ + // mul v13.4s, v24.4s, v0.s[0] // .................................................................................................................................~..........................................'..................................................................................................................................*.................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................~........................................'....................................................................................................................................*.................................. 
+ // mls v13.4s, v24.4s, v8.s[0] // ..................................................................................................................................................~.........................'...................................................................................................................................................*................... + // sub v24.4s, v10.4s, v14.4s // ................................................................................................................................~...........................................'.................................................................................................................................*..................................... + // add v10.4s, v10.4s, v14.4s // ...............................................................................................................................~............................................'................................................................................................................................*...................................... + // mul v14.4s, v24.4s, v0.s[0] // ........................................................................................................................................~...................................'.........................................................................................................................................*............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................................................~................................'............................................................................................................................................*.......................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................................................................~.......................'.....................................................................................................................................................*................. + // sub v24.4s, v11.4s, v15.4s // ..................................................................................................................~.........................................................'...................................................................................................................*................................................... + // add v11.4s, v11.4s, v15.4s // ...................................................................................................................~........................................................'....................................................................................................................*.................................................. + // mul v15.4s, v24.4s, v0.s[0] // ..........................................................................................................................~.................................................'...........................................................................................................................*........................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................................................~.............................................'...............................................................................................................................*....................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // .......................................................................................................................................................~....................'........................................................................................................................................................*.............. + // sub v24.4s, v12.4s, v16.4s // .......................................................................................................................~....................................................'........................................................................................................................*.............................................. + // add v12.4s, v12.4s, v16.4s // .........................................................................................................................~..................................................'..........................................................................................................................*............................................ + // mul v16.4s, v24.4s, v0.s[0] // .............................................................................................................................................~..............................'..............................................................................................................................................*........................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................~............................'................................................................................................................................................*...................... 
+ // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................................................~................'............................................................................................................................................................*.......... + // str q9, [x1], #(16*4) // ..................................................................................................................................~.........................................'...................................................................................................................................*................................... + // str q10, [x1, #(-16*4 + 1*16)] // ....................................................................................................................................~.......................................'.....................................................................................................................................*................................. + // str q11, [x1, #(-16*4 + 2*16)] // ......................................................................................................................~.....................................................'.......................................................................................................................*............................................... + // str q12, [x1, #(-16*4 + 3*16)] // ...........................................................................................................................~................................................'............................................................................................................................*.......................................... 
+ // str q13, [x2], #(16*4) // .........................................................................................................................................................~..................'..........................................................................................................................................................*............ + // str q14, [x2, #(-16*4 + 1*16)] // .............................................................................................................................................................~..............'..............................................................................................................................................................*........ + // str q15, [x2, #(-16*4 + 2*16)] // .................................................................................................................................................................~..........'..................................................................................................................................................................*.... + // str q16, [x2, #(-16*4 + 3*16)] // ...................................................................................................................................................................~........'....................................................................................................................................................................*.. + // add x1, x1, #64 // .....................................................................................................................................~......................................'......................................................................................................................................*................................ 
+ // add x2, x2, #64 // .....................................................................................................................................................................~......'......................................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer45678_start + // Instructions: 144 + // Expected cycles: 124 + // Expected IPC: 1.16 + // + // Wall time: 86.82s + // User time: 86.82s + // + // -------------------------------------------------------------- original position --------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + trn1 v11.4S, v18.4S, v0.4S // .*.............................................................................................................................................. + mls v25.4S, v9.4S, v8.S[0] // ...*............................................................................................................................................ + ldr q27, [x2, #0] // *............................................................................................................................................... + ldr q5, [x4, #16] // ..................*............................................................................................................................. + add v21.4S, v1.4S, v24.4S // ....*........................................................................................................................................... + ldr q13, [x5], #(12*16) // ..*............................................................................................................................................. 
+ trn2 v29.4S, v18.4S, v0.4S // ..............*................................................................................................................................. + ldr q20, [x5, #-96] // .....*.......................................................................................................................................... + ldr q4, [x4], #64 // ...............*................................................................................................................................ + sqrdmulh v7.4S, v16.4S, v30.4S // .......*........................................................................................................................................ + ldr q10, [x4, #-16] // .......................*........................................................................................................................ + ldr q26, [x4, #-32] // ...................*............................................................................................................................ + ldr q2, [x5, #-64] // ........*....................................................................................................................................... + trn1 v18.4S, v27.4S, v17.4S // ..........*..................................................................................................................................... + // gap // ................................................................................................................................................ + ldr q15, [x5, #-48] // .........*...................................................................................................................................... + mul v23.4S, v16.4S, v13.4S // ................*............................................................................................................................... 
+ sub v28.4S, v25.4S, v14.4S // .................*.............................................................................................................................. + ldr q19, [x5, #-32] // ...........*.................................................................................................................................... + trn2 v24.4S, v27.4S, v17.4S // .............*.................................................................................................................................. + // gap // ................................................................................................................................................ + trn1 v12.2D, v18.2D, v11.2D // ........................*....................................................................................................................... + trn2 v11.2D, v18.2D, v11.2D // ....................*........................................................................................................................... + ldr q31, [x5, #-16] // ............*................................................................................................................................... + sqrdmulh v27.4S, v28.4S, v30.4S // .............................*.................................................................................................................. + ldr q1, [x5, #-80] // ......*......................................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v9.4S, v28.4S, v13.4S // .........................*...................................................................................................................... + // gap // ................................................................................................................................................ + trn2 v0.2D, v24.2D, v29.2D // ......................*......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v23.4S, v7.4S, v8.S[0] // .....................*.......................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v18.4S, v11.4S, v0.4S // ...........................*.................................................................................................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v9.4S, v27.4S, v8.S[0] // .....................................*.......................................................................................................... + trn1 v27.2D, v24.2D, v29.2D // ............................*................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v18.4S, v31.4S // ..................................*............................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v31.4S, v12.4S, v27.4S // .................................*.............................................................................................................. 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v3.4S, v12.4S, v27.4S // ...................................*............................................................................................................ + mul v18.4S, v18.4S, v19.4S // ................................*............................................................................................................... + // gap // ................................................................................................................................................ + add v12.4S, v25.4S, v14.4S // ..........................*..................................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v28.4S, v31.4S, v15.4S // .........................................*...................................................................................................... + add v27.4S, v11.4S, v0.4S // ....................................*........................................................................................................... + // gap // ................................................................................................................................................ 
+ trn1 v29.4S, v23.4S, v9.4S // ..........................................*..................................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v11.4S, v23.4S, v9.4S // ...........................................*.................................................................................................... + mul v14.4S, v31.4S, v2.4S // ......................................*......................................................................................................... + // gap // ................................................................................................................................................ + sub v0.4S, v3.4S, v27.4S // .......................................*........................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v18.4S, v16.4S, v8.S[0] // ............................................*................................................................................................... + trn2 v13.4S, v21.4S, v12.4S // ...............................*................................................................................................................ 
+ // gap // ................................................................................................................................................ + trn1 v22.4S, v21.4S, v12.4S // ..............................*................................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v12.4S, v0.4S, v1.4S // .............................................*.................................................................................................. + add v30.4S, v3.4S, v27.4S // ........................................*....................................................................................................... + // gap // ................................................................................................................................................ + trn1 v31.2D, v13.2D, v11.2D // .................................................*.............................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn1 v2.2D, v22.2D, v29.2D // ...............................................*................................................................................................ 
+ mls v14.4S, v28.4S, v8.S[0] // ................................................*............................................................................................... + // gap // ................................................................................................................................................ + trn2 v7.2D, v22.2D, v29.2D // ..............................................*................................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v25.2D, v13.2D, v11.2D // ..................................................*............................................................................................. + mul v9.4S, v0.4S, v20.4S // ...................................................*............................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v11.4S, v2.4S, v31.4S // ....................................................*........................................................................................... 
+ mls v9.4S, v12.4S, v8.S[0] // .....................................................*.......................................................................................... + add v23.4S, v2.4S, v31.4S // ......................................................*......................................................................................... + // gap // ................................................................................................................................................ + add v28.4S, v14.4S, v18.4S // ........................................................*....................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v2.4S, v11.4S, v5.S[2] // .........................................................*...................................................................................... + add v0.4S, v7.4S, v25.4S // ............................................................*................................................................................... + // gap // ................................................................................................................................................ + sub v21.4S, v7.4S, v25.4S // ..........................................................*..................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ trn1 v29.4S, v30.4S, v28.4S // .............................................................*.................................................................................. + // gap // ................................................................................................................................................ + sqrdmulh v12.4S, v11.4S, v5.S[3] // .................................................................*.............................................................................. + sub v3.4S, v23.4S, v0.4S // ................................................................*............................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v17.4S, v23.4S, v0.4S // ..................................................................*............................................................................. + mul v22.4S, v21.4S, v26.S[0] // ...................................................................*............................................................................ + // gap // ................................................................................................................................................ + trn2 v11.4S, v30.4S, v28.4S // ...............................................................*................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sub v15.4S, v14.4S, v18.4S // .......................................................*........................................................................................ + mul v6.4S, v3.4S, v4.S[2] // ..........................................................................*..................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + srshr v27.4S, v17.4S, #23 // ....................................................................*........................................................................... + mls v2.4S, v12.4S, v8.S[0] // ......................................................................*......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + sqrdmulh v24.4S, v21.4S, v26.S[1] // .......................................................................*........................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v16.4S, v15.4S, v1.4S // ..............................................................*................................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mul v30.4S, v15.4S, v20.4S // ...........................................................*.................................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v22.4S, v24.4S, v8.S[0] // ..............................................................................*................................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + mls v30.4S, v16.4S, v8.S[0] // .....................................................................*.......................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v21.4S, v3.4S, v4.S[3] // ...........................................................................*.................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v18.4S, v2.4S, v22.4S // ........................................................................................*....................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + add v16.4S, v2.4S, v22.4S // ..........................................................................................*..................................................... + mls v17.4S, v27.4S, v8.4S // .................................................................................*.............................................................. + // gap // ................................................................................................................................................ + trn1 v31.4S, v9.4S, v30.4S // ........................................................................*....................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v13.4S, v18.4S, v4.S[3] // ...........................................................................................................*.................................... + trn2 v30.4S, v9.4S, v30.4S // .........................................................................*...................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ srshr v27.4S, v16.4S, #23 // ..............................................................................................*................................................. + mls v6.4S, v21.4S, v8.S[0] // ...................................................................................*............................................................ + trn2 v23.2D, v29.2D, v31.2D // ............................................................................*................................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + trn2 v24.2D, v11.2D, v30.2D // ................................................................................*............................................................... + trn1 v21.2D, v29.2D, v31.2D // .............................................................................*.................................................................. + // gap // ................................................................................................................................................ + mul v3.4S, v18.4S, v4.S[2] // ...................................................................................................*............................................ + trn1 v15.2D, v11.2D, v30.2D // ...............................................................................*................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v22.4S, v23.4S, v24.4S // ......................................................................................*......................................................... + mls v16.4S, v27.4S, v8.4S // .................................................................................................................*.............................. + sub v28.4S, v23.4S, v24.4S // .....................................................................................*.......................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v3.4S, v13.4S, v8.S[0] // ..............................................................................................................*................................. + add v13.4S, v21.4S, v15.4S // ....................................................................................*........................................................... + // gap // ................................................................................................................................................ 
+ sub v20.4S, v21.4S, v15.4S // ..................................................................................*............................................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v30.4S, v28.4S, v10.S[1] // .............................................................................................*.................................................. + sub v14.4S, v13.4S, v22.4S // ...........................................................................................*.................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v1.4S, v28.4S, v10.S[0] // ...............................................................................................*................................................ + add v29.4S, v13.4S, v22.4S // ............................................................................................*................................................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v22.4S, v20.4S, v26.S[3] // .........................................................................................*...................................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v2.4S, v20.4S, v26.S[2] // .......................................................................................*........................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v1.4S, v30.4S, v8.S[0] // ..................................................................................................*............................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v2.4S, v22.4S, v8.S[0] // .................................................................................................*.............................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + srshr v21.4S, v29.4S, #23 // ................................................................................................*............................................... + sqrdmulh v31.4S, v14.4S, v5.S[1] // ......................................................................................................*......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v29.4S, v21.4S, v8.4S // ..................................................................................................................*............................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ sub v12.4S, v2.4S, v1.4S // .....................................................................................................*.......................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v28.4S, v14.4S, v5.S[0] // ....................................................................................................*........................................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v11.4S, v12.4S, v5.S[1] // ..........................................................................................................*..................................... 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v27.4S, v17.4S, v29.4S // ..........................................................................................................................*..................... + mul v10.4S, v12.4S, v5.S[0] // ........................................................................................................*....................................... + add v12.4S, v2.4S, v1.4S // .......................................................................................................*........................................ + // gap // ................................................................................................................................................ + sub v19.4S, v17.4S, v29.4S // .........................................................................................................................*...................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v28.4S, v31.4S, v8.S[0] // ............................................................................................................*................................... + str q27, [x1], #(16*4) // ...............................................................................................................................*................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + srshr v1.4S, v12.4S, #23 // .........................................................................................................*...................................... + mls v10.4S, v11.4S, v8.S[0] // .............................................................................................................*.................................. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v12.4S, v1.4S, v8.4S // .....................................................................................................................*.......................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sub v31.4S, v6.4S, v28.4S // ...............................................................................................................*................................ + add v27.4S, v6.4S, v28.4S // ................................................................................................................*............................... + // gap // ................................................................................................................................................ + mul v21.4S, v19.4S, v4.S[0] // ..............................................................................................................................*................. + sub v25.4S, v3.4S, v10.4S // ....................................................................................................................*........................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + add v28.4S, v3.4S, v10.4S // ......................................................................................................................*......................... + mul v30.4S, v31.4S, v4.S[0] // .......................................................................................................................*........................ 
+ // gap // ................................................................................................................................................ + str q27, [x1, #-32] // ...................................................................................................................*............................ + // gap // ................................................................................................................................................ + add v27.4S, v16.4S, v12.4S // ............................................................................................................................*................... + sub v29.4S, v16.4S, v12.4S // .............................................................................................................................*.................. + mul v0.4S, v25.4S, v4.S[0] // .....................................................................................................................................*.......... + // gap // ................................................................................................................................................ + str q28, [x1, #-16] // ........................................................................................................................*....................... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q27, [x1, #-48] // .................................................................................................................................*.............. + add x1, x1, #64 // ..................................................................................................................................*............. 
+ sqrdmulh v12.4S, v25.4S, v4.S[1] // ......................................................................................................................................*......... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v20.4S, v29.4S, v4.S[1] // ....................................................................................................................................*........... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v1.4S, v31.4S, v4.S[1] // ...........................................................................................................................*.................... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + sqrdmulh v9.4S, v19.4S, v4.S[1] // ................................................................................................................................*............... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mul v14.4S, v29.4S, v4.S[0] // ...................................................................................................................................*............ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v14.4S, v20.4S, v8.S[0] // ........................................................................................................................................*....... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v30.4S, v1.4S, v8.S[0] // .........................................................................................................................................*...... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v21.4S, v9.4S, v8.S[0] // .......................................................................................................................................*........ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q14, [x2, #16] // ............................................................................................................................................*... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + mls v0.4S, v12.4S, v8.S[0] // ...........................................................................................................................................*.... + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q30, [x2, #32] // .............................................................................................................................................*.. + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + str q21, [x2], #(16*4) // ..........................................................................................................................................*..... + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ + // gap // ................................................................................................................................................ 
+ // gap // ................................................................................................................................................ + str q0, [x2, #-16] // ..............................................................................................................................................*. + add x2, x2, #64 // ...............................................................................................................................................* + // gap // ................................................................................................................................................ + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q27, [x2, #0] // ..*............................................................................................................................................. + // trn1 v11.4S, v18.4S, v0.4S // *............................................................................................................................................... + // ldr q23, [x5], #(12*16) // .....*.......................................................................................................................................... + // mls v25.4S, v9.4S, v8.S[0] // .*.............................................................................................................................................. + // add v24.4S, v1.4S, v24.4S // ....*........................................................................................................................................... 
+ // ldr q9, [x5, #-96] // .......*........................................................................................................................................ + // ldr q13, [x5, #-80] // .......................*........................................................................................................................ + // sqrdmulh v1.4S, v16.4S, v30.4S // .........*...................................................................................................................................... + // ldr q3, [x5, #-64] // ............*................................................................................................................................... + // ldr q20, [x5, #-48] // ..............*................................................................................................................................. + // trn1 v21.4S, v27.4S, v17.4S // .............*.................................................................................................................................. + // ldr q15, [x5, #-32] // .................*.............................................................................................................................. + // ldr q26, [x5, #-16] // .....................*.......................................................................................................................... + // trn2 v27.4S, v27.4S, v17.4S // ..................*............................................................................................................................. + // trn2 v18.4S, v18.4S, v0.4S // ......*......................................................................................................................................... + // ldr q0, [x4], #64 // ........*....................................................................................................................................... 
+ // mul v16.4S, v16.4S, v23.4S // ...............*................................................................................................................................ + // sub v17.4S, v25.4S, v14.4S // ................*............................................................................................................................... + // ldr q2, [x4, #-48] // ...*............................................................................................................................................ + // ldr q4, [x4, #-32] // ...........*.................................................................................................................................... + // trn2 v5.2D, v21.2D, v11.2D // ....................*........................................................................................................................... + // mls v16.4S, v1.4S, v8.S[0] // ..........................*..................................................................................................................... + // trn2 v1.2D, v27.2D, v18.2D // .........................*...................................................................................................................... + // ldr q22, [x4, #-16] // ..........*..................................................................................................................................... + // trn1 v11.2D, v21.2D, v11.2D // ...................*............................................................................................................................ + // mul v23.4S, v17.4S, v23.4S // ........................*....................................................................................................................... + // add v25.4S, v25.4S, v14.4S // ..................................*............................................................................................................. 
+ // sub v19.4S, v5.4S, v1.4S // ...........................*.................................................................................................................... + // trn1 v27.2D, v27.2D, v18.2D // .............................*.................................................................................................................. + // sqrdmulh v18.4S, v17.4S, v30.4S // ......................*......................................................................................................................... + // trn1 v17.4S, v24.4S, v25.4S // ...........................................*.................................................................................................... + // trn2 v25.4S, v24.4S, v25.4S // ..........................................*..................................................................................................... + // mul v24.4S, v19.4S, v15.4S // .................................*.............................................................................................................. + // sub v15.4S, v11.4S, v27.4S // ...............................*................................................................................................................ + // sqrdmulh v26.4S, v19.4S, v26.4S // ..............................*................................................................................................................. + // add v27.4S, v11.4S, v27.4S // ................................*............................................................................................................... + // add v11.4S, v5.4S, v1.4S // ....................................*........................................................................................................... + // mls v23.4S, v18.4S, v8.S[0] // ............................*................................................................................................................... 
+ // mul v1.4S, v15.4S, v3.4S // .......................................*........................................................................................................ + // sub v3.4S, v27.4S, v11.4S // ........................................*....................................................................................................... + // add v27.4S, v27.4S, v11.4S // .............................................*.................................................................................................. + // sqrdmulh v11.4S, v15.4S, v20.4S // ...................................*............................................................................................................ + // trn1 v20.4S, v16.4S, v23.4S // .....................................*.......................................................................................................... + // trn2 v23.4S, v16.4S, v23.4S // ......................................*......................................................................................................... + // mls v24.4S, v26.4S, v8.S[0] // .........................................*...................................................................................................... + // sqrdmulh v15.4S, v3.4S, v13.4S // ............................................*................................................................................................... + // trn2 v26.2D, v17.2D, v20.2D // .................................................*.............................................................................................. + // trn1 v20.2D, v17.2D, v20.2D // ...............................................*................................................................................................ + // mls v1.4S, v11.4S, v8.S[0] // ................................................*............................................................................................... 
+ // trn1 v11.2D, v25.2D, v23.2D // ..............................................*................................................................................................. + // trn2 v23.2D, v25.2D, v23.2D // ..................................................*............................................................................................. + // mul v25.4S, v3.4S, v9.4S // ...................................................*............................................................................................ + // sub v3.4S, v20.4S, v11.4S // ....................................................*........................................................................................... + // mls v25.4S, v15.4S, v8.S[0] // .....................................................*.......................................................................................... + // add v11.4S, v20.4S, v11.4S // ......................................................*......................................................................................... + // sub v20.4S, v1.4S, v24.4S // .................................................................*.............................................................................. + // add v24.4S, v1.4S, v24.4S // .......................................................*........................................................................................ + // mul v1.4S, v3.4S, v2.S[2] // ........................................................*....................................................................................... + // sub v15.4S, v26.4S, v23.4S // ..........................................................*..................................................................................... + // mul v9.4S, v20.4S, v9.4S // .......................................................................*........................................................................ 
+ // add v23.4S, v26.4S, v23.4S // .........................................................*...................................................................................... + // trn1 v26.4S, v27.4S, v24.4S // ...........................................................*.................................................................................... + // sqrdmulh v13.4S, v20.4S, v13.4S // ......................................................................*......................................................................... + // trn2 v27.4S, v27.4S, v24.4S // ................................................................*............................................................................... + // sub v24.4S, v11.4S, v23.4S // .............................................................*.................................................................................. + // sqrdmulh v3.4S, v3.4S, v2.S[3] // ............................................................*................................................................................... + // add v11.4S, v11.4S, v23.4S // ..............................................................*................................................................................. + // mul v23.4S, v15.4S, v4.S[0] // ...............................................................*................................................................................ + // srshr v20.4S, v11.4S, #23 // ...................................................................*............................................................................ + // mls v9.4S, v13.4S, v8.S[0] // .........................................................................*...................................................................... + // mls v1.4S, v3.4S, v8.S[0] // ....................................................................*........................................................................... 
+ // sqrdmulh v13.4S, v15.4S, v4.S[1] // .....................................................................*.......................................................................... + // trn1 v3.4S, v25.4S, v9.4S // ..............................................................................*................................................................. + // trn2 v25.4S, v25.4S, v9.4S // ................................................................................*............................................................... + // mul v9.4S, v24.4S, v0.S[2] // ..................................................................*............................................................................. + // sqrdmulh v24.4S, v24.4S, v0.S[3] // ..........................................................................*..................................................................... + // trn2 v15.2D, v26.2D, v3.2D // ...................................................................................*............................................................ + // trn1 v3.2D, v26.2D, v3.2D // .....................................................................................*.......................................................... + // mls v23.4S, v13.4S, v8.S[0] // ........................................................................*....................................................................... + // trn1 v13.2D, v27.2D, v25.2D // .......................................................................................*........................................................ + // trn2 v27.2D, v27.2D, v25.2D // ....................................................................................*........................................................... + // mls v11.4S, v20.4S, v8.4S // .............................................................................*.................................................................. 
+ // sub v25.4S, v3.4S, v13.4S // .............................................................................................*.................................................. + // mls v9.4S, v24.4S, v8.S[0] // ..................................................................................*............................................................. + // add v24.4S, v3.4S, v13.4S // ............................................................................................*................................................... + // sub v13.4S, v15.4S, v27.4S // ..........................................................................................*..................................................... + // add v27.4S, v15.4S, v27.4S // ........................................................................................*....................................................... + // mul v3.4S, v25.4S, v4.S[2] // ...................................................................................................*............................................ + // sub v20.4S, v1.4S, v23.4S // ...........................................................................*.................................................................... + // sqrdmulh v25.4S, v25.4S, v4.S[3] // ..................................................................................................*............................................. + // add v23.4S, v1.4S, v23.4S // ............................................................................*................................................................... + // sub v1.4S, v24.4S, v27.4S // ...............................................................................................*................................................ + // add v27.4S, v24.4S, v27.4S // .................................................................................................*.............................................. 
+ // sqrdmulh v24.4S, v13.4S, v22.S[1] // ..............................................................................................*................................................. + // srshr v15.4S, v23.4S, #23 // .................................................................................*.............................................................. + // mul v13.4S, v13.4S, v22.S[0] // ................................................................................................*............................................... + // srshr v26.4S, v27.4S, #23 // ......................................................................................................*......................................... + // mls v3.4S, v25.4S, v8.S[0] // .....................................................................................................*.......................................... + // mls v13.4S, v24.4S, v8.S[0] // ....................................................................................................*........................................... + // mul v25.4S, v20.4S, v0.S[2] // ......................................................................................*......................................................... + // mul v24.4S, v1.4S, v2.S[0] // ..........................................................................................................*..................................... + // sub v18.4S, v3.4S, v13.4S // .........................................................................................................*...................................... + // sqrdmulh v1.4S, v1.4S, v2.S[1] // .......................................................................................................*........................................ + // add v13.4S, v3.4S, v13.4S // ..............................................................................................................*................................. 
+ // mul v3.4S, v18.4S, v2.S[0] // .............................................................................................................*.................................. + // srshr v16.4S, v13.4S, #23 // ..................................................................................................................*............................. + // sqrdmulh v18.4S, v18.4S, v2.S[1] // ...........................................................................................................*.................................... + // sqrdmulh v20.4S, v20.4S, v0.S[3] // ...............................................................................*................................................................ + // mls v24.4S, v1.4S, v8.S[0] // ................................................................................................................*............................... + // mls v3.4S, v18.4S, v8.S[0] // ...................................................................................................................*............................ + // mls v25.4S, v20.4S, v8.S[0] // ...........................................................................................*.................................................... + // sub v1.4S, v9.4S, v24.4S // .....................................................................................................................*.......................... + // add v24.4S, v9.4S, v24.4S // ......................................................................................................................*......................... + // mls v23.4S, v15.4S, v8.4S // .........................................................................................*...................................................... + // mls v27.4S, v26.4S, v8.4S // ........................................................................................................*....................................... 
+ // str q24, [x1, #32] // ...........................................................................................................................*.................... + // sub v24.4S, v25.4S, v3.4S // ........................................................................................................................*....................... + // mls v13.4S, v16.4S, v8.4S // ....................................................................................................................*........................... + // add v25.4S, v25.4S, v3.4S // .........................................................................................................................*...................... + // mul v9.4S, v1.4S, v0.S[0] // ..........................................................................................................................*..................... + // str q25, [x1, #48] // ...............................................................................................................................*................ + // sub v25.4S, v11.4S, v27.4S // ...............................................................................................................*................................ + // add v27.4S, v11.4S, v27.4S // ............................................................................................................*................................... + // sqrdmulh v11.4S, v1.4S, v0.S[1] // ....................................................................................................................................*........... + // add v1.4S, v23.4S, v13.4S // ............................................................................................................................*................... + // sub v23.4S, v23.4S, v13.4S // .............................................................................................................................*.................. 
+ // mul v13.4S, v25.4S, v0.S[0] // .......................................................................................................................*........................ + // str q27, [x1], #(16*4) // .................................................................................................................*.............................. + // sqrdmulh v27.4S, v25.4S, v0.S[1] // .....................................................................................................................................*.......... + // str q1, [x1, #-48] // ................................................................................................................................*............... + // add x1, x1, #64 // .................................................................................................................................*.............. + // mul v3.4S, v23.4S, v0.S[0] // ......................................................................................................................................*......... + // sqrdmulh v23.4S, v23.4S, v0.S[1] // ...................................................................................................................................*............ + // mul v18.4S, v24.4S, v0.S[0] // ..............................................................................................................................*................. + // sqrdmulh v24.4S, v24.4S, v0.S[1] // ..................................................................................................................................*............. + // mls v13.4S, v27.4S, v8.S[0] // .........................................................................................................................................*...... + // mls v3.4S, v23.4S, v8.S[0] // .......................................................................................................................................*........ 
+ // mls v9.4S, v11.4S, v8.S[0] // ........................................................................................................................................*....... + // str q13, [x2], #(16*4) // .............................................................................................................................................*.. + // mls v18.4S, v24.4S, v8.S[0] // ...........................................................................................................................................*.... + // str q3, [x2, #-48] // ..........................................................................................................................................*..... + // str q9, [x2, #-32] // ............................................................................................................................................*... + // str q18, [x2, #-16] // ..............................................................................................................................................*. + // add x2, x2, #64 // ...............................................................................................................................................* + + +// ----------------------------------------------------------------------------- + + // Setup for the loop at layer123_start: register aliases, broadcast constants, loop counter and roots. + // NOTE(review): the loop body further down reads v25/v26 by raw register name; presumably those are the ninv/ninv_tw values set up here -- confirm they are not clobbered in between. + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + // ld1r replicates the 32-bit element at [xtmp] into all four lanes of the destination. + // ASM_LOAD is a macro defined outside this chunk; presumably it places the address of the named symbol in xtmp -- TODO confirm. + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + // modulus_half = consts >> 1 (unsigned shift, per 32-bit lane); neg_modulus_half = -modulus_half. + // NOTE(review): assumes consts holds the modulus in its lanes -- confirm against the definition of consts. + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + // count starts at 8 and is decremented once after the preamble, before layer123_start is entered. + // load_roots_123 is a macro defined outside this chunk; presumably it loads the twiddle factors via r_ptr0 -- TODO confirm. + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + // Software-pipelining preamble: 13 instructions issued once before the loop. They appear to mirror the instructions flagged 'e' in the loop body (same operations, load offsets 16 bytes lower, some registers renamed). + // Registers left live for the first loop iteration (read by the loop body before it overwrites them): v13, v20, v18, v16, v14, v5, v19, v4. + .p2align 2 + // Instructions: 13 + // Expected cycles: 14 + // Expected IPC: 0.93 + // + // Wall time: 0.04s + // User time: 0.04s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + ldr q11, [x0, #768] // *............................. + ldr q4, [x0, #896] // .....*........................
+ // gap // .............................. + ldr q13, [x0, #256] // .*............................ + // gap // .............................. + // gap // .............................. + ldr q20, [x0, #384] // ..*........................... + // gap // .............................. + // gap // .............................. + ldr q18, [x0, #512] // ...*.......................... + // gap // .............................. + // gap // .............................. + sub v28.4S, v11.4S, v4.4S // ......*....................... + ldr q16, [x0, #640] // ....*......................... + // gap // .............................. + add v14.4S, v11.4S, v4.4S // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v6.4S, v28.4S, v3.S[1] // .........*.................... + // gap // .............................. + // gap // .............................. + sub v5.4S, v18.4S, v16.4S // ........*..................... + // gap // .............................. + // gap // .............................. + mul v19.4S, v28.4S, v3.S[0] // ..........*................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + sqrdmulh v4.4S, v5.4S, v2.S[3] // ...........*.................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + mls v19.4S, v6.4S, v8.S[0] // ............*................. + // gap // .............................. + // gap // ..............................
+ + // The commented listing below is the preamble in its pre-scheduling order and register names; each '*' marks the slot the instruction was moved to in the emitted code above. + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q21, [x0, #768] // *.............................. + // ldr q13, [x0, #256] // ..*............................ + // ldr q20, [x0, #384] // ...*........................... + // ldr q18, [x0, #512] // ....*.......................... + // ldr q16, [x0, #640] // ......*........................ + // ldr q17, [x0, #896] // .*............................. + // sub v22.4S, v21.4S, v17.4S // .....*......................... + // add v14.4S, v21.4S, v17.4S // .......*....................... + // sub v5.4S, v18.4S, v16.4S // .........*..................... + // sqrdmulh v24.4S, v22.4S, v3.S[1] // ........*...................... + // mul v19.4S, v22.4S, v3.S[0] // ..........*.................... + // sqrdmulh v4.4S, v5.4S, v2.S[3] // ...........*................... + // mls v19.4S, v24.4S, v8.S[0] // ............*.................. + + // Decrement the trip count before entering the loop; presumably this accounts for the iteration that is split between the preamble and the code after the loop -- TODO confirm. + sub count, count, #1 +layer123_start: + // Instructions: 120 + // Expected cycles: 112 + // Expected IPC: 1.07 + // + // Wall time: 13.14s + // User time: 13.14s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + ldr q27, [x0, #0] // *....................................................................................................................... + ldr q11, [x0, #128] // .*...................................................................................................................... + sub v7.4S, v13.4S, v20.4S // .............*.......................................................................................................... + mul v24.4S, v5.4S, v2.S[2] // ....................*...................................................................................................
+ add v23.4S, v13.4S, v20.4S // ..............*......................................................................................................... + ldr q21, [x0, #784] // ......e................................................................................................................. + add v9.4S, v18.4S, v16.4S // ...................*.................................................................................................... + ldr q13, [x0, #272] // ..e..................................................................................................................... + ldr q20, [x0, #400] // ...e.................................................................................................................... + mul v15.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ + ldr q18, [x0, #528] // ....e................................................................................................................... + ldr q16, [x0, #656] // .....e.................................................................................................................. + ldr q17, [x0, #912] // .......e................................................................................................................ + sub v5.4S, v27.4S, v11.4S // ........*............................................................................................................... + // gap // ........................................................................................................................ + add v27.4S, v27.4S, v11.4S // .........*.............................................................................................................. + sqrdmulh v11.4S, v7.4S, v2.S[1] // ................*....................................................................................................... 
+ // gap // ........................................................................................................................ + sub v7.4S, v9.4S, v14.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v24.4S, v4.4S, v8.S[0] // ......................*................................................................................................. + add v9.4S, v9.4S, v14.4S // .......................................*................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v27.4S, v23.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v27.4S, v27.4S, v23.4S // .............................*.......................................................................................... + mul v23.4S, v5.4S, v1.S[2] // ..........*............................................................................................................. + // gap // ........................................................................................................................ + sub v22.4S, v21.4S, v17.4S // .......................e................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v14.4S, v21.4S, v17.4S // ........................e............................................................................................... + mls v15.4S, v11.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + sub v11.4S, v24.4S, v19.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v21.4S, v5.4S, v1.S[3] // ...........*............................................................................................................ + add v24.4S, v24.4S, v19.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + sub v17.4S, v27.4S, v9.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v27.4S, v27.4S, v9.4S // .................................................*...................................................................... + mul v9.4S, v4.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + sub v5.4S, v18.4S, v16.4S // ..................e..................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v4.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v19.4S, v7.4S, v1.S[0] // ........................................*............................................................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v23.4S, v21.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v9.4S, v4.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v7.4S, v8.S[0] // ..........................................*............................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v7.4S, v23.4S, v15.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v23.4S, v23.4S, v15.4S // ..................................*..................................................................................... + mul v21.4S, v11.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v15.4S, v7.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v23.4S, v24.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v23.4S, v23.4S, v24.4S // ......................................................*................................................................. + sqrdmulh v7.4S, v7.4S, v0.S[3] // ....................................*................................................................................... 
+ // gap // ........................................................................................................................ + sub v24.4S, v9.4S, v19.4S // ..........................................................*............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v11.4S, v11.4S, v1.S[1] // ..............................................*......................................................................... + add v9.4S, v9.4S, v19.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v19.4S, v17.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v15.4S, v7.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v11.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v11.4S, v17.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v7.4S, v4.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v17.4S, v15.4S, v21.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v21.4S, v15.4S, v21.4S // ................................................................*....................................................... + mul v15.4S, v27.4S, v25.4S // ........................................................................................*............................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v27.4S, v27.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v11.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v11.4S, v4.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v4.4S, v24.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v19.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v24.4S, v24.4S, v0.S[1] // .............................................................*.......................................................... + cmge v6.4S, v19.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v7.4S, v11.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v11.4S, v29.4S, v6.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v29.4S, v17.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v4.4S, v24.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v24.4S, v31.4S, v7.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v17.4S, v0.S[1] // ..................................................................*..................................................... + cmge v6.4S, v7.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v11.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v11.4S, v24.4S, v6.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v15.4S, v27.4S, v8.S[0] // ..........................................................................................*............................. + cmge v27.4S, v31.4S, v4.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + cmge v24.4S, v4.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v29.4S, v17.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q19, [x0, #512] // ....................................................................................*................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v7.4S, v11.4S, v8.4S // ...........................................................................*............................................ + sub v27.4S, v27.4S, v24.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v11.4S, v31.4S, v15.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v24.4S, v15.4S, v30.4S // .....................................................................................................*.................. + mul v17.4S, v23.4S, v25.4S // ...........................................................................................*............................ 
+ // gap // ........................................................................................................................ + cmge v19.4S, v31.4S, v29.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v4.4S, v27.4S, v8.4S // ...............................................................................*........................................ + cmge v27.4S, v29.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + str q7, [x0, #640] // .....................................................................................*.................................. + sub v11.4S, v11.4S, v24.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v23.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v27.4S, v19.4S, v27.4S // ..................................................................................*..................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v23.4S, v9.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q4, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v24.4S, v9.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v17.4S, v7.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v21.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v23.4S, v24.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + cmge v24.4S, v31.4S, v17.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v9.4S, v21.4S, v25.4S // .................................................................................................*...................... + cmge v21.4S, v17.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v9.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v7.4S, v24.4S, v21.4S // ..........................................................................................................*............. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v29.4S, v27.4S, v8.4S // ...................................................................................*.................................... + cmge v27.4S, v31.4S, v23.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ + cmge v24.4S, v23.4S, v30.4S // .............................................................................................................*.......... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v15.4S, v11.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v11.4S, v31.4S, v9.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v17.4S, v7.4S, v8.4S // ...........................................................................................................*............ + cmge v7.4S, v9.4S, v30.4S // .................................................................................................................*...... + // gap // ........................................................................................................................ + str q29, [x0, #896] // .......................................................................................*................................ + sub v27.4S, v27.4S, v24.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + sqrdmulh v24.4S, v22.4S, v3.S[1] // ..........................e............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q15, [x0], #(16) // ....................................................................................................................*... + sub v11.4S, v11.4S, v7.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + mls v23.4S, v27.4S, v8.4S // ...............................................................................................................*........ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q17, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v9.4S, v11.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v19.4S, v22.4S, v3.S[0] // .........................e.............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ str q23, [x0, #240] // ......................................................................................................................*. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v5.4S, v2.S[3] // .....................e.................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q9, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v24.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ + // ------------------------------------------------------------------------------------------------------------- new position --------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------- + // ldr q9, [x0, #0] // ...................................................................................................................*...................................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................'*..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e................................................................................................................'......~............................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................'.......~.............................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................'.........~............................................................................................................ 
+ // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................'..........~........................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // e..................................................................................................................'....~................................................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // .......e...........................................................................................................'...........~.......................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ........~..........................................................................................................'............*......................................................................................................... + // add v9.4s, v9.4s, v10.4s // .........~.........................................................................................................'.............*........................................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................~..................................................................................................'....................*................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................~.............................................................................................'.........................*............................................................................................ 
+ // mls v10.4s, v24.4s, v8.s[0] // ..............................~....................................................................................'..................................*................................................................................... + // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................'.*.................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................................................................................................'...*.................................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ....~..............................................................................................................'........*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........~........................................................................................................'..............*....................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................~...............................................................................................'.......................*.............................................................................................. + // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................'..............................~....................................................................................... 
+ // add v13.4s, v13.4s, v14.4s // .~.................................................................................................................'.....*................................................................................................................ + // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................'..*................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..'....................................................................................................................~. + // mls v14.4s, v24.4s, v8.s[0] // ............~......................................................................................................'................*..................................................................................................... + // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................'.....................~................................................................................................ + // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................'......................~............................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....'..................................................................................................................~... 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........'............................................................................................................~......... + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e'...................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............~....................................................................................................'..................*................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...............~...................................................................................................'...................*.................................................................................................. + // mul v11.4s, v24.4s, v0.s[2] // .........................~.........................................................................................'.............................*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................~.......................................................................................'...............................*...................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...............................~...................................................................................'...................................*.................................................................................. 
+ // sub v24.4s, v10.4s, v12.4s // .................................~.................................................................................'.....................................*................................................................................ + // add v10.4s, v10.4s, v12.4s // ..................................~................................................................................'......................................*............................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ....................................~..............................................................................'........................................*............................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................~...........................................................................'...........................................*.......................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ............................................~......................................................................'................................................*..................................................................... + // sub v24.4s, v13.4s, v15.4s // ...........~.......................................................................................................'...............*...................................................................................................... + // add v13.4s, v13.4s, v15.4s // .............~.....................................................................................................'.................*.................................................................................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // ............................~......................................................................................'................................*..................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................~.....................................................................................'.................................*.................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ................................~..................................................................................'....................................*................................................................................. + // sub v24.4s, v14.4s, v16.4s // ....................~..............................................................................................'........................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ......................~............................................................................................'..........................*........................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................~...............................................................................'.......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................~.........................................................................'.............................................*........................................................................ 
+ // mls v16.4s, v24.4s, v8.s[0] // .............................................~.....................................................................'.................................................*.................................................................... + // sub v24.4s, v9.4s, v13.4s // .......................~...........................................................................................'...........................*.......................................................................................... + // add v9.4s, v9.4s, v13.4s // ........................~..........................................................................................'............................*......................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...........................................~.......................................................................'...............................................*...................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................~....................................................................'..................................................*................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ....................................................~..............................................................'........................................................*............................................................. + // sub v24.4s, v10.4s, v14.4s // .....................................~.............................................................................'.........................................*............................................................................ 
+ // add v10.4s, v10.4s, v14.4s // ......................................~............................................................................'..........................................*........................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ...............................................~...................................................................'...................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................~.............................................................'.........................................................*............................................................ + // mls v14.4s, v24.4s, v8.s[0] // ..........................................................~........................................................'..............................................................*....................................................... + // sub v24.4s, v11.4s, v15.4s // ........................................~..........................................................................'............................................*......................................................................... + // add v11.4s, v11.4s, v15.4s // ..........................................~........................................................................'..............................................*....................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................................~............................................................'..........................................................*........................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................~..........................................................'............................................................*......................................................... + // mls v15.4s, v24.4s, v8.s[0] // .............................................................~.....................................................'.................................................................*.................................................... + // sub v24.4s, v12.4s, v16.4s // ................................................~..................................................................'....................................................*................................................................. + // add v12.4s, v12.4s, v16.4s // .................................................~.................................................................'.....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ............................................................~......................................................'................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................~...................................................'...................................................................*.................................................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................~............................................'..........................................................................*........................................... 
+ // cmge v27.4s, v31.4s, v13.4s // .......................................................~...........................................................'...........................................................*.......................................................... + // cmge v28.4s, v13.4s, v30.4s // .........................................................~.........................................................'.............................................................*........................................................ + // sub v28.4s, v27.4s, v28.4s // ...........................................................~.......................................................'...............................................................*...................................................... + // mls v13.4s, v28.4s, v8.4s // .................................................................~.................................................'.....................................................................*................................................ + // cmge v27.4s, v31.4s, v14.4s // ..............................................................~....................................................'..................................................................*................................................... + // cmge v28.4s, v14.4s, v30.4s // ................................................................~..................................................'....................................................................*................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................~................................................'......................................................................*............................................... 
+ // mls v14.4s, v28.4s, v8.4s // ........................................................................~..........................................'............................................................................*......................................... + // cmge v27.4s, v31.4s, v15.4s // ....................................................................~..............................................'........................................................................*............................................. + // cmge v28.4s, v15.4s, v30.4s // .....................................................................~.............................................'.........................................................................*............................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................................~.........................................'.............................................................................*........................................ + // mls v15.4s, v28.4s, v8.4s // ..............................................................................~....................................'..................................................................................*................................... + // cmge v27.4s, v31.4s, v16.4s // .............................................................................~.....................................'.................................................................................*.................................... + // cmge v28.4s, v16.4s, v30.4s // ...............................................................................~...................................'...................................................................................*.................................. 
+ // sub v28.4s, v27.4s, v28.4s // ...................................................................................~...............................'.......................................................................................*.............................. + // mls v16.4s, v28.4s, v8.4s // ...............................................................................................~...................'...................................................................................................*.................. + // str q13, [x0, #(4*(1024/8))] // .......................................................................~...........................................'...........................................................................*.......................................... + // str q14, [x0, #(5*(1024/8))] // ................................................................................~..................................'....................................................................................*................................. + // str q15, [x0, #(6*(1024/8))] // .....................................................................................~.............................'.........................................................................................*............................ + // str q16, [x0, #(7*(1024/8))] // ......................................................................................................~............'..........................................................................................................*........... + // mul v13.4s, v9.4s, v25.4s // ..................................................~................................................................'......................................................*............................................................... 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................~...............................................................'.......................................................*.............................................................. + // mls v13.4s, v9.4s, v8.s[0] // ...................................................................~...............................................'.......................................................................*.............................................. + // mul v14.4s, v10.4s, v25.4s // ............................................................................~......................................'................................................................................*..................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................~................................'......................................................................................*............................... + // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................~...........................'...........................................................................................*.......................... + // mul v15.4s, v11.4s, v25.4s // ....................................................................................~..............................'........................................................................................*............................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................~............................'..........................................................................................*........................... 
+ // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................~.........................'.............................................................................................*........................ + // mul v16.4s, v12.4s, v25.4s // ...........................................................................................~.......................'...............................................................................................*...................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................~..........................'............................................................................................*......................... + // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................~.....................'.................................................................................................*.................... + // cmge v27.4s, v31.4s, v13.4s // ..........................................................................~........................................'..............................................................................*....................................... + // cmge v28.4s, v13.4s, v30.4s // ...........................................................................~.......................................'...............................................................................*...................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................~.................................'.....................................................................................*................................ 
+ // mls v13.4s, v28.4s, v8.4s // ..................................................................................................~................'......................................................................................................*............... + // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................~........................'..............................................................................................*....................... + // cmge v28.4s, v14.4s, v30.4s // ............................................................................................~......................'................................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................................~....................'..................................................................................................*................... + // mls v14.4s, v28.4s, v8.4s // ....................................................................................................~..............'........................................................................................................*............. + // cmge v27.4s, v31.4s, v15.4s // ................................................................................................~..................'....................................................................................................*................. + // cmge v28.4s, v15.4s, v30.4s // .................................................................................................~.................'.....................................................................................................*................ 
+ // sub v28.4s, v27.4s, v28.4s // .......................................................................................................~...........'...........................................................................................................*.......... + // mls v15.4s, v28.4s, v8.4s // ...........................................................................................................~.......'...............................................................................................................*...... + // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................~...............'.......................................................................................................*.............. + // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................~.............'.........................................................................................................*............ + // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................~........'..............................................................................................................*....... + // mls v16.4s, v28.4s, v8.4s // .............................................................................................................~.....'.................................................................................................................*.... + // str q13, [x0], #(16) // .........................................................................................................~.........'.............................................................................................................*........ 
+ // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................~......'................................................................................................................*..... + // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................~...'...................................................................................................................*.. + // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................~.'.....................................................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + // Instructions: 107 + // Expected cycles: 108 + // Expected IPC: 0.99 + // + // Wall time: 2.05s + // User time: 2.05s + // + // ------------------------------------------- original position --------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------ + mul v27.4S, v5.4S, v2.S[2] // ...*....................................................................................................... + add v11.4S, v18.4S, v16.4S // .....*..................................................................................................... + ldr q7, [x0, #0] // *.......................................................................................................... + ldr q23, [x0, #128] // .*......................................................................................................... + sub v24.4S, v13.4S, v20.4S // ..*........................................................................................................ 
+ // gap // ........................................................................................................... + add v9.4S, v13.4S, v20.4S // ....*...................................................................................................... + mls v27.4S, v4.4S, v8.S[0] // ...........*............................................................................................... + // gap // ........................................................................................................... + sub v13.4S, v11.4S, v14.4S // ..........*................................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v11.4S, v11.4S, v14.4S // ............*.............................................................................................. + mul v20.4S, v24.4S, v2.S[0] // ......*.................................................................................................... + // gap // ........................................................................................................... + sub v21.4S, v7.4S, v23.4S // .......*................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v7.4S, v7.4S, v23.4S // ........*.................................................................................................. + sqrdmulh v23.4S, v24.4S, v2.S[1] // .........*................................................................................................. 
+ // gap // ........................................................................................................... + sub v24.4S, v27.4S, v19.4S // .................*......................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v27.4S, v27.4S, v19.4S // ...................*....................................................................................... + mul v15.4S, v21.4S, v1.S[2] // ...............*........................................................................................... + // gap // ........................................................................................................... + sub v18.4S, v7.4S, v9.4S // .............*............................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v7.4S, v7.4S, v9.4S // ..............*............................................................................................ + mul v9.4S, v13.4S, v1.S[0] // ........................*.................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + sqrdmulh v13.4S, v13.4S, v1.S[1] // .........................*................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v16.4S, v7.4S, v11.4S // ....................*...................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v11.4S, v7.4S, v11.4S // .....................*..................................................................................... + mls v20.4S, v23.4S, v8.S[0] // ................*.......................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v7.4S, v21.4S, v1.S[3] // ..................*........................................................................................ + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v23.4S, v18.4S, v0.S[2] // ......................*.................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v21.4S, v18.4S, v0.S[3] // .......................*................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mul v18.4S, v24.4S, v1.S[0] // ...............................*........................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v24.4S, v24.4S, v1.S[1] // .....................................*..................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v15.4S, v7.4S, v8.S[0] // ..........................*................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v23.4S, v21.4S, v8.S[0] // ...........................*............................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v9.4S, v13.4S, v8.S[0] // ............................*.............................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v7.4S, v15.4S, v20.4S // ..............................*............................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v13.4S, v15.4S, v20.4S // .............................*............................................................................. 
+ mul v20.4S, v16.4S, v0.S[0] // .......................................*................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v18.4S, v24.4S, v8.S[0] // .........................................*................................................................. + sub v24.4S, v7.4S, v27.4S // .................................*......................................................................... + // gap // ........................................................................................................... + add v27.4S, v7.4S, v27.4S // ..................................*........................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v7.4S, v13.4S, v0.S[2] // ................................*.......................................................................... + sub v21.4S, v23.4S, v9.4S // ....................................*...................................................................... + // gap // ........................................................................................................... + add v23.4S, v23.4S, v9.4S // ......................................*.................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v9.4S, v13.4S, v0.S[3] // ...................................*....................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v13.4S, v16.4S, v0.S[1] // ..........................................*................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v15.4S, v24.4S, v0.S[0] // ...........................................*............................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v16.4S, v11.4S, v25.4S // ..............................................*............................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v11.4S, v11.4S, v26.4S // ...............................................*........................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v7.4S, v9.4S, v8.S[0] // ........................................*.................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v20.4S, v13.4S, v8.S[0] // ................................................*.......................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v24.4S, v24.4S, v0.S[1] // .................................................*......................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ sub v9.4S, v7.4S, v18.4S // ............................................*.............................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v7.4S, v7.4S, v18.4S // .............................................*............................................................. + mul v13.4S, v21.4S, v0.S[0] // ..................................................*........................................................ + // gap // ........................................................................................................... + cmge v18.4S, v31.4S, v20.4S // ...................................................*....................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v21.4S, v21.4S, v0.S[1] // ....................................................*...................................................... + cmge v17.4S, v20.4S, v30.4S // .....................................................*..................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v15.4S, v24.4S, v8.S[0] // ......................................................*.................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v24.4S, v18.4S, v17.4S // .......................................................*................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v18.4S, v9.4S, v0.S[0] // ........................................................*.................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v13.4S, v21.4S, v8.S[0] // .........................................................*................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ cmge v21.4S, v31.4S, v15.4S // ..........................................................*................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v9.4S, v9.4S, v0.S[1] // ...........................................................*............................................... + cmge v17.4S, v15.4S, v30.4S // ............................................................*.............................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v20.4S, v24.4S, v8.4S // .............................................................*............................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v24.4S, v21.4S, v17.4S // ..............................................................*............................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v16.4S, v11.4S, v8.S[0] // ...............................................................*........................................... + cmge v11.4S, v31.4S, v13.4S // ................................................................*.......................................... + // gap // ........................................................................................................... + cmge v21.4S, v13.4S, v30.4S // .................................................................*......................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v18.4S, v9.4S, v8.S[0] // ..................................................................*........................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q20, [x0, #512] // ...................................................................*....................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v15.4S, v24.4S, v8.4S // ....................................................................*...................................... + sub v11.4S, v11.4S, v21.4S // .....................................................................*..................................... + // gap // ........................................................................................................... 
+ cmge v24.4S, v31.4S, v16.4S // ......................................................................*.................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v9.4S, v16.4S, v30.4S // .......................................................................*................................... + mul v20.4S, v27.4S, v25.4S // ........................................................................*.................................. + // gap // ........................................................................................................... + cmge v21.4S, v31.4S, v18.4S // .........................................................................*................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v27.4S, v27.4S, v26.4S // ..............................................................................*............................ + cmge v17.4S, v18.4S, v30.4S // ...........................................................................*............................... + // gap // ........................................................................................................... + str q15, [x0, #640] // ............................................................................*.............................. + sub v24.4S, v24.4S, v9.4S // .............................................................................*............................. + // gap // ........................................................................................................... 
+ mul v9.4S, v23.4S, v25.4S // ................................................................................*.......................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v21.4S, v21.4S, v17.4S // ...............................................................................*........................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v23.4S, v23.4S, v26.4S // ..................................................................................*........................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v20.4S, v27.4S, v8.S[0] // ...................................................................................*....................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v27.4S, v7.4S, v26.4S // ....................................................................................*...................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v9.4S, v23.4S, v8.S[0] // .....................................................................................*..................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v23.4S, v31.4S, v20.4S // ......................................................................................*.................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mul v7.4S, v7.4S, v25.4S // .......................................................................................*................... + cmge v15.4S, v20.4S, v30.4S // ........................................................................................*.................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v7.4S, v27.4S, v8.S[0] // .........................................................................................*................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v27.4S, v23.4S, v15.4S // ..........................................................................................*................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v13.4S, v11.4S, v8.4S // ..........................................................................*................................ + cmge v11.4S, v31.4S, v9.4S // ............................................................................................*.............. + // gap // ........................................................................................................... 
+ cmge v23.4S, v9.4S, v30.4S // .............................................................................................*............. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v18.4S, v21.4S, v8.4S // ...........................................................................................*............... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v21.4S, v31.4S, v7.4S // ...............................................................................................*........... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v16.4S, v24.4S, v8.4S // ..............................................................................................*............ + cmge v24.4S, v7.4S, v30.4S // .................................................................................................*......... + // gap // ........................................................................................................... + str q13, [x0, #768] // .................................................................................*......................... + sub v11.4S, v11.4S, v23.4S // ...................................................................................................*....... + // gap // ........................................................................................................... 
+ mls v20.4S, v27.4S, v8.4S // ................................................................................................*.......... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q18, [x0, #896] // ..................................................................................................*........ + sub v27.4S, v21.4S, v24.4S // .....................................................................................................*..... + // gap // ........................................................................................................... + mls v9.4S, v11.4S, v8.4S // ......................................................................................................*.... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q16, [x0], #(16) // ....................................................................................................*...... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v7.4S, v27.4S, v8.4S // ........................................................................................................*.. + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ str q20, [x0, #112] // .......................................................................................................*... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q9, [x0, #240] // .........................................................................................................*. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q7, [x0, #368] // ..........................................................................................................* + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ + // ---------------------------------------------- new position ----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------ + // ldr q27, [x0, #0] // ..*........................................................................................................ + // ldr q11, [x0, #128] // ...*....................................................................................................... + // sub v7.4S, v13.4S, v20.4S // ....*...................................................................................................... + // mul v24.4S, v5.4S, v2.S[2] // *.......................................................................................................... + // add v23.4S, v13.4S, v20.4S // .....*..................................................................................................... + // add v9.4S, v18.4S, v16.4S // .*......................................................................................................... + // mul v15.4S, v7.4S, v2.S[0] // .........*................................................................................................. + // sub v5.4S, v27.4S, v11.4S // ..........*................................................................................................ + // add v27.4S, v27.4S, v11.4S // ...........*............................................................................................... + // sqrdmulh v11.4S, v7.4S, v2.S[1] // ............*.............................................................................................. + // sub v7.4S, v9.4S, v14.4S // .......*................................................................................................... + // mls v24.4S, v4.4S, v8.S[0] // ......*.................................................................................................... 
+ // add v9.4S, v9.4S, v14.4S // ........*.................................................................................................. + // sub v4.4S, v27.4S, v23.4S // ................*.......................................................................................... + // add v27.4S, v27.4S, v23.4S // .................*......................................................................................... + // mul v23.4S, v5.4S, v1.S[2] // ...............*........................................................................................... + // mls v15.4S, v11.4S, v8.S[0] // ......................*.................................................................................... + // sub v11.4S, v24.4S, v19.4S // .............*............................................................................................. + // sqrdmulh v21.4S, v5.4S, v1.S[3] // .......................*................................................................................... + // add v24.4S, v24.4S, v19.4S // ..............*............................................................................................ + // sub v17.4S, v27.4S, v9.4S // ....................*...................................................................................... + // add v27.4S, v27.4S, v9.4S // .....................*..................................................................................... + // mul v9.4S, v4.4S, v0.S[2] // ........................*.................................................................................. + // sqrdmulh v4.4S, v4.4S, v0.S[3] // .........................*................................................................................. + // mul v19.4S, v7.4S, v1.S[0] // ..................*........................................................................................ 
+ // sqrdmulh v7.4S, v7.4S, v1.S[1] // ...................*....................................................................................... + // mls v23.4S, v21.4S, v8.S[0] // ............................*.............................................................................. + // mls v9.4S, v4.4S, v8.S[0] // .............................*............................................................................. + // mls v19.4S, v7.4S, v8.S[0] // ..............................*............................................................................ + // sub v7.4S, v23.4S, v15.4S // ................................*.......................................................................... + // add v23.4S, v23.4S, v15.4S // ...............................*........................................................................... + // mul v21.4S, v11.4S, v1.S[0] // ..........................*................................................................................ + // mul v15.4S, v7.4S, v0.S[2] // .....................................*..................................................................... + // sub v4.4S, v23.4S, v24.4S // ...................................*....................................................................... + // add v23.4S, v23.4S, v24.4S // ....................................*...................................................................... + // sqrdmulh v7.4S, v7.4S, v0.S[3] // ........................................*.................................................................. + // sub v24.4S, v9.4S, v19.4S // ......................................*.................................................................... + // sqrdmulh v11.4S, v11.4S, v1.S[1] // ...........................*............................................................................... 
+ // add v9.4S, v9.4S, v19.4S // .......................................*................................................................... + // mul v19.4S, v17.4S, v0.S[0] // .................................*......................................................................... + // mls v15.4S, v7.4S, v8.S[0] // .............................................*............................................................. + // mls v21.4S, v11.4S, v8.S[0] // ..................................*........................................................................ + // sqrdmulh v11.4S, v17.4S, v0.S[1] // .........................................*................................................................. + // mul v7.4S, v4.4S, v0.S[0] // ..........................................*................................................................ + // sub v17.4S, v15.4S, v21.4S // ................................................*.......................................................... + // add v21.4S, v15.4S, v21.4S // .................................................*......................................................... + // mul v15.4S, v27.4S, v25.4S // ...........................................*............................................................... + // sqrdmulh v27.4S, v27.4S, v26.4S // ............................................*.............................................................. + // mls v19.4S, v11.4S, v8.S[0] // ..............................................*............................................................ + // sqrdmulh v11.4S, v4.4S, v0.S[1] // ...............................................*........................................................... + // mul v4.4S, v24.4S, v0.S[0] // ..................................................*........................................................ 
+ // cmge v29.4S, v31.4S, v19.4S // ...................................................*....................................................... + // sqrdmulh v24.4S, v24.4S, v0.S[1] // ....................................................*...................................................... + // cmge v6.4S, v19.4S, v30.4S // .....................................................*..................................................... + // mls v7.4S, v11.4S, v8.S[0] // ......................................................*.................................................... + // sub v11.4S, v29.4S, v6.4S // .......................................................*................................................... + // mul v29.4S, v17.4S, v0.S[0] // ........................................................*.................................................. + // mls v4.4S, v24.4S, v8.S[0] // .........................................................*................................................. + // cmge v24.4S, v31.4S, v7.4S // ..........................................................*................................................ + // sqrdmulh v17.4S, v17.4S, v0.S[1] // ...........................................................*............................................... + // cmge v6.4S, v7.4S, v30.4S // ............................................................*.............................................. + // mls v19.4S, v11.4S, v8.4S // .............................................................*............................................. + // sub v11.4S, v24.4S, v6.4S // ..............................................................*............................................ + // mls v15.4S, v27.4S, v8.S[0] // ...............................................................*........................................... 
+ // cmge v27.4S, v31.4S, v4.4S // ................................................................*.......................................... + // cmge v24.4S, v4.4S, v30.4S // .................................................................*......................................... + // mls v29.4S, v17.4S, v8.S[0] // ..................................................................*........................................ + // str q19, [x0, #512] // ...................................................................*....................................... + // mls v7.4S, v11.4S, v8.4S // ....................................................................*...................................... + // sub v27.4S, v27.4S, v24.4S // .....................................................................*..................................... + // cmge v11.4S, v31.4S, v15.4S // ......................................................................*.................................... + // cmge v24.4S, v15.4S, v30.4S // .......................................................................*................................... + // mul v17.4S, v23.4S, v25.4S // ........................................................................*.................................. + // cmge v19.4S, v31.4S, v29.4S // .........................................................................*................................. + // mls v4.4S, v27.4S, v8.4S // .........................................................................................*................. + // cmge v27.4S, v29.4S, v30.4S // ...........................................................................*............................... + // str q7, [x0, #640] // ............................................................................*.............................. + // sub v11.4S, v11.4S, v24.4S // .............................................................................*............................. 
+ // sqrdmulh v7.4S, v23.4S, v26.4S // ..........................................................................*................................ + // sub v27.4S, v19.4S, v27.4S // ...............................................................................*........................... + // mul v23.4S, v9.4S, v25.4S // ..............................................................................*............................ + // str q4, [x0, #768] // ................................................................................................*.......... + // sqrdmulh v24.4S, v9.4S, v26.4S // ................................................................................*.......................... + // mls v17.4S, v7.4S, v8.S[0] // .................................................................................*......................... + // sqrdmulh v7.4S, v21.4S, v26.4S // ..................................................................................*........................ + // mls v23.4S, v24.4S, v8.S[0] // ...................................................................................*....................... + // cmge v24.4S, v31.4S, v17.4S // ....................................................................................*...................... + // mul v9.4S, v21.4S, v25.4S // .....................................................................................*..................... + // cmge v21.4S, v17.4S, v30.4S // ......................................................................................*.................... + // mls v9.4S, v7.4S, v8.S[0] // .......................................................................................*................... + // sub v7.4S, v24.4S, v21.4S // ........................................................................................*.................. 
+ // mls v29.4S, v27.4S, v8.4S // ............................................................................................*.............. + // cmge v27.4S, v31.4S, v23.4S // ..........................................................................................*................ + // cmge v24.4S, v23.4S, v30.4S // ...........................................................................................*............... + // mls v15.4S, v11.4S, v8.4S // ..............................................................................................*............ + // cmge v11.4S, v31.4S, v9.4S // .............................................................................................*............. + // mls v17.4S, v7.4S, v8.4S // ..................................................................................................*........ + // cmge v7.4S, v9.4S, v30.4S // ...............................................................................................*........... + // str q29, [x0, #896] // ...................................................................................................*....... + // sub v27.4S, v27.4S, v24.4S // .................................................................................................*......... + // str q15, [x0], #(16) // ......................................................................................................*.... + // sub v11.4S, v11.4S, v7.4S // ....................................................................................................*...... + // mls v23.4S, v27.4S, v8.4S // .....................................................................................................*..... + // str q17, [x0, #112] // ........................................................................................................*.. + // mls v9.4S, v11.4S, v8.4S // .......................................................................................................*... 
+ // str q23, [x0, #240] // .........................................................................................................*. + // str q9, [x0, #368] // ..........................................................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s new file mode 100644 index 00000000..f05c8e90 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1,2531 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, consts +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, 
addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo 
root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack 
// @slothy:no-unfold
+// push_stack body: spill GPRs, then the vector registers, then reserve
+// STACK_SIZE bytes of scratch space addressed via save/restore.
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// pop_stack: exact reverse of push_stack (release scratch space, then
+// restore vector registers, then GPRs).
+.macro pop_stack // @slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle-factor table, 16-byte aligned, pulled in from a separate file.
+// NOTE(review): the roots_l345 / roots_l67 labels loaded later are presumably
+// defined inside this include - confirm.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+
+// Exported with and without a leading underscore; presumably to cover both
+// ELF and Mach-O symbol naming - confirm.
+        .global intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm
+        .global _intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm
+
+// Constants placed in .text directly ahead of the entry point.
+// const_addr: 8380417 followed by three zero words (16 bytes in total); the
+// function broadcasts the first word into `consts` with ld1r.
+// NOTE(review): 8380417 is presumably the Dilithium modulus q - confirm.
+.p2align 4
+const_addr: .word 8380417
+        .word 0
+        .word 0
+        .word 0
+// NOTE(review): names suggest the n^-1 scaling constant and its twiddle
+// counterpart; neither is referenced in the part of the function visible
+// here - confirm usage.
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm:
+_intt_dilithium_123_45678_manual_ld4_opt_m1_firestorm:
+        push_stack
+
+        // General-purpose register aliases: x0 is the incoming data pointer,
+        // x1/x2 are working pointers derived from it, x3 the loop counter,
+        // x4/x5 the twiddle pointers and x6 a temporary.
+        in .req x0
+        inp .req x1
+        inpp .req x2
+        count .req x3
+        r_ptr0 .req x4
+        r_ptr1 .req x5
+        xtmp .req x6
+
+        // Eight data vectors, v9..v16.
+        data0 .req v9
+        data1 .req v10
+        data2 .req v11
+        data3 .req v12
+        data4 .req v13
+        data5 .req v14
+        data6 .req v15
+        data7 .req v16
+
+        // Q-register views of the same eight data vectors (for ldr/str).
+        qform_data0 .req q9
+        qform_data1 .req q10
+        qform_data2 .req q11
+        qform_data3 .req q12
+        qform_data4 .req q13
+        qform_data5 .req q14
+        qform_data6 .req q15
+        qform_data7 .req q16
+
+        // qform_vN names the Q-register view qN of vector register vN,
+        // for every N in 0..31.
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+
+        // Eight scalar temporaries x10..x17, named x_<row><half>.
+        x_00 .req x10
+        x_01 .req x11
+        x_10 .req x12
+        x_11 .req x13
+        x_20 .req x14
+        x_21 .req x15
+        x_30 .req x16
+        x_31 .req x17
+
+        // xt_* are second names for the same x10..x17 registers in a
+        // permuted order; no additional registers are claimed.
+        xt_00 .req x_00
+        xt_01 .req x_20
+        xt_10 .req x_10
+        xt_11 .req x_30
+        xt_20 .req x_01
+        xt_21 .req x_21
+        xt_30 .req x_11
+        xt_31
.req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + // Instructions: 10 + // Expected cycles: 4 + // Expected IPC: 2.50 + // + // Wall time: 0.09s + // User time: 0.09s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q29, [x1, #0] // .....*........................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q11, [x5, #96] // .........*.................... + ldr q0, [x5, #32] // *............................. + ldr q24, [x5], #(12*16) // .*............................ + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q19, [x1, #32] // ........*..................... + // gap // .............................. + ldr q2, [x5, #-112] // ...*.......................... 
+ // gap // .............................. + ldr q30, [x5, #-128] // ....*......................... + ldr q7, [x1, #48] // ......*....................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + ldr q15, [x5, #-144] // ..*........................... + ldr q6, [x1, #16] // .......*...................... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // ldr q0, [x5, #32] // ..*............................ + // ldr q24, [x5], #(12*16) // ...*........................... + // ldr q15, [x5, #-144] // ........*...................... + // ldr q2, [x5, #-112] // .....*......................... + // ldr q30, [x5, #-128] // ......*........................ + // ldr q29, [x1, #0] // *.............................. + // ldr q7, [x1, #48] // .......*....................... + // ldr q6, [x1, #16] // .........*..................... + // ldr q19, [x1, #32] // ....*.......................... + // ldr q11, [x5, #-96] // .*............................. + + sub count, count, #1 +layer45678_start: + // Instructions: 174 + // Expected cycles: 54 + // Expected IPC: 3.22 + // + // Wall time: 209.71s + // User time: 209.71s + // + // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + ldr q10, [x2, #32] // ..............*............................................................................................................................................................... 
+ trn2 v25.4S, v29.4S, v6.4S // .....*........................................................................................................................................................................ + trn2 v22.4S, v19.4S, v7.4S // .......*...................................................................................................................................................................... + ldr q14, [x2, #16] // .............*................................................................................................................................................................ + ldr q20, [x2, #0] // ............*................................................................................................................................................................. + trn1 v23.4S, v29.4S, v6.4S // ....*......................................................................................................................................................................... + trn1 v4.4S, v19.4S, v7.4S // ......*....................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q19, [x2, #48] // ...............*.............................................................................................................................................................. + ldr q26, [x5, #-64] // ....................................................*......................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v17.2D, v23.2D, v4.2D // ..........*................................................................................................................................................................... + trn2 v18.2D, v23.2D, v4.2D // ........*..................................................................................................................................................................... + trn1 v5.2D, v25.2D, v22.2D // ...........*.................................................................................................................................................................. + trn2 v7.2D, v25.2D, v22.2D // .........*.................................................................................................................................................................... + ldr q31, [x5, #-16] // .......................................................*...................................................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn2 v28.4S, v20.4S, v14.4S // .................*............................................................................................................................................................ + sub v22.4S, v18.4S, v7.4S // ...................................*.......................................................................................................................................... + sub v9.4S, v17.4S, v5.4S // ..............................*............................................................................................................................................... + add v16.4S, v17.4S, v5.4S // ...............................*.............................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v21.4S, v10.4S, v19.4S // ..................*........................................................................................................................................................... 
+ trn2 v6.4S, v10.4S, v19.4S // ...................*.......................................................................................................................................................... + trn1 v19.4S, v20.4S, v14.4S // ................*............................................................................................................................................................. + add v4.4S, v18.4S, v7.4S // ....................................*......................................................................................................................................... + ldr q14, [x5, #-80] // ...................................................*.......................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v0.4S, v9.4S, v0.4S // ................................*............................................................................................................................................. + sqrdmulh v29.4S, v9.4S, v15.4S // .................................*............................................................................................................................................ 
+ sqrdmulh v17.4S, v22.4S, v2.4S // ......................................*....................................................................................................................................... + mul v22.4S, v22.4S, v30.4S // .....................................*........................................................................................................................................ + ldr q30, [x5, #-48] // .....................................................*........................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v20.2D, v28.2D, v6.2D // .....................*........................................................................................................................................................ + trn1 v28.2D, v28.2D, v6.2D // .......................*...................................................................................................................................................... + trn2 v10.2D, v19.2D, v21.2D // ....................*......................................................................................................................................................... 
+ trn1 v21.2D, v19.2D, v21.2D // ......................*....................................................................................................................................................... + ldr q6, [x5, #-32] // ......................................................*....................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q18, [x5, #-176] // .........................*.................................................................................................................................................... + sub v12.4S, v16.4S, v4.4S // ........................................*..................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v22.4S, v17.4S, v8.S[0] // .......................................*...................................................................................................................................... + ldr q17, [x4, #16] // .............................................................................................*................................................................................ + sub v13.4S, v21.4S, v28.4S // ........................................................*..................................................................................................................... + sub v27.4S, v10.4S, v20.4S // .............................................................*................................................................................................................ + mls v0.4S, v29.4S, v8.S[0] // ..................................*........................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + add v2.4S, v21.4S, v28.4S // .........................................................*.................................................................................................................... + add v28.4S, v10.4S, v20.4S // ..............................................................*............................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v19.4S, v13.4S, v30.4S // ...........................................................*.................................................................................................................. 
+ mul v29.4S, v13.4S, v26.4S // ..........................................................*................................................................................................................... + mul v30.4S, v27.4S, v6.4S // ...............................................................*.............................................................................................................. + sqrdmulh v6.4S, v27.4S, v31.4S // ................................................................*............................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v21.4S, v0.4S, v22.4S // ..............................................*............................................................................................................................... + sub v31.4S, v0.4S, v22.4S // .............................................*................................................................................................................................ + ldr q0, [x5, #32] // ..........................e................................................................................................................................................... 
+ sub v10.4S, v2.4S, v28.4S // ..................................................................*........................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v27.4S, v12.4S, v24.4S // ..........................................*................................................................................................................................... + sqrdmulh v12.4S, v12.4S, v18.4S // ...........................................*.................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v30.4S, v6.4S, v8.S[0] // .................................................................*............................................................................................................ + ldr q6, [x4, #32] // ..............................................................................................*............................................................................... + sqrdmulh v3.4S, v31.4S, v18.4S // ................................................*............................................................................................................................. + mls v29.4S, v19.4S, v8.S[0] // ............................................................*................................................................................................................. + mul v19.4S, v31.4S, v24.4S // ...............................................*.............................................................................................................................. + ldr q24, [x5], #(12*16) // ........................e..................................................................................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + add v31.4S, v16.4S, v4.4S // .........................................*.................................................................................................................................... + ldr q16, [x4], #64 // ............................................................................................*................................................................................. + mul v22.4S, v10.4S, v11.4S // ....................................................................*......................................................................................................... + sqrdmulh v10.4S, v10.4S, v14.4S // .....................................................................*........................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v27.4S, v12.4S, v8.S[0] // ............................................*................................................................................................................................. 
+ add v12.4S, v2.4S, v28.4S // ...................................................................*.......................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v19.4S, v3.4S, v8.S[0] // .................................................*............................................................................................................................ + sub v1.4S, v29.4S, v30.4S // .......................................................................*...................................................................................................... + add v28.4S, v29.4S, v30.4S // ........................................................................*..................................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v22.4S, v10.4S, v8.S[0] // ......................................................................*....................................................................................................... + trn2 v30.4S, v31.4S, v21.4S // .............................................................................*................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v25.4S, v12.4S, v28.4S // ....................................................................................*......................................................................................... + trn2 v28.4S, v12.4S, v28.4S // .....................................................................................*........................................................................................ + sqrdmulh v10.4S, v1.4S, v14.4S // ..........................................................................*................................................................................................... + mul v5.4S, v1.4S, v11.4S // .........................................................................*.................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn2 v26.4S, v27.4S, v19.4S // ...............................................................................*.............................................................................................. + trn1 v27.4S, v27.4S, v19.4S // ..............................................................................*............................................................................................... + trn1 v19.4S, v31.4S, v21.4S // ............................................................................*................................................................................................. + ldr q31, [x4, #-16] // ...............................................................................................*.............................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v5.4S, v10.4S, v8.S[0] // ...........................................................................*.................................................................................................. + trn2 v10.2D, v19.2D, v27.2D // ................................................................................*............................................................................................. + trn1 v19.2D, v19.2D, v27.2D // ..................................................................................*........................................................................................... 
+ trn2 v29.2D, v30.2D, v26.2D // .................................................................................*............................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v4.2D, v30.2D, v26.2D // ...................................................................................*.......................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v12.4S, v10.4S, v29.4S // .....................................................................................................*........................................................................ + add v23.4S, v10.4S, v29.4S // ......................................................................................................*....................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn1 v30.4S, v22.4S, v5.4S // ......................................................................................*....................................................................................... + trn2 v9.4S, v22.4S, v5.4S // .......................................................................................*...................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v14.4S, v12.4S, v6.S[0] // .......................................................................................................*...................................................................... 
+ add v18.4S, v19.4S, v4.4S // .................................................................................................*............................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v3.2D, v25.2D, v30.2D // ........................................................................................*..................................................................................... + trn1 v10.2D, v28.2D, v9.2D // ...........................................................................................*.................................................................................. + trn2 v7.2D, v28.2D, v9.2D // .........................................................................................*.................................................................................... 
+ trn1 v11.2D, v25.2D, v30.2D // ..........................................................................................*................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v28.4S, v19.4S, v4.4S // ................................................................................................*............................................................................. + sqrdmulh v5.4S, v12.4S, v6.S[1] // ........................................................................................................*..................................................................... + add v1.4S, v18.4S, v23.4S // .....................................................................................................................*........................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v29.4S, v3.4S, v7.4S // ...............................................................................................................*.............................................................. + add v27.4S, v3.4S, v7.4S // ................................................................................................................*............................................................. + sub v12.4S, v11.4S, v10.4S // ..........................................................................................................*................................................................... + add v7.4S, v11.4S, v10.4S // ...........................................................................................................*.................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mul v22.4S, v28.4S, v17.S[2] // ..................................................................................................*........................................................................... + sqrdmulh v19.4S, v28.4S, v17.S[3] // ...................................................................................................*.......................................................................... + srshr v10.4S, v1.4S, #23 // ........................................................................................................................................*..................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v30.4S, v12.4S, v6.S[2] // ............................................................................................................*................................................................. 
+ sqrdmulh v6.4S, v12.4S, v6.S[3] // .............................................................................................................*................................................................ + mul v20.4S, v29.4S, v31.S[0] // .................................................................................................................*............................................................ + sqrdmulh v15.4S, v29.4S, v31.S[1] // ..................................................................................................................*........................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v14.4S, v5.4S, v8.S[0] // .........................................................................................................*.................................................................... + sub v2.4S, v7.4S, v27.4S // ..............................................................................................................................*............................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v1.4S, v10.4S, v8.4S // .........................................................................................................................................*.................................... + mls v22.4S, v19.4S, v8.S[0] // ....................................................................................................*......................................................................... + add v19.4S, v7.4S, v27.4S // ...............................................................................................................................*.............................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v20.4S, v15.4S, v8.S[0] // ...................................................................................................................*.......................................................... + mls v30.4S, v6.4S, v8.S[0] // ..............................................................................................................*............................................................... + sub v12.4S, v18.4S, v23.4S // ....................................................................................................................*......................................................... + sqrdmulh v11.4S, v2.4S, v17.S[1] // .................................................................................................................................*............................................ + ldr q15, [x5, #-144] // ...........................e.................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + srshr v10.4S, v19.4S, #23 // ............................................................................................................................................*................................. + mul v29.4S, v2.4S, v17.S[0] // ................................................................................................................................*............................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v28.4S, v22.4S, v14.4S // .........................................................................................................................*.................................................... 
+ add v4.4S, v22.4S, v14.4S // ..........................................................................................................................*................................................... + mul v5.4S, v12.4S, v16.S[2] // ......................................................................................................................*....................................................... + sqrdmulh v27.4S, v12.4S, v16.S[3] // .......................................................................................................................*...................................................... + ldr q2, [x5, #-112] // .............................e................................................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v31.4S, v30.4S, v20.4S // ...................................................................................................................................*.......................................... + add v18.4S, v30.4S, v20.4S // ....................................................................................................................................*......................................... 
+ ldr q30, [x5, #-128] // ............................e................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v19.4S, v10.4S, v8.4S // .............................................................................................................................................*................................ + mul v22.4S, v28.4S, v16.S[2] // ...........................................................................................................................*.................................................. + sqrdmulh v14.4S, v28.4S, v16.S[3] // ............................................................................................................................*................................................. + srshr v28.4S, v4.4S, #23 // ..........................................................................................................................................*................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v6.4S, v31.4S, v17.S[0] // .....................................................................................................................................*........................................ + sqrdmulh v31.4S, v31.4S, v17.S[1] // ......................................................................................................................................*....................................... + mls v5.4S, v27.4S, v8.S[0] // ........................................................................................................................*..................................................... + srshr v12.4S, v18.4S, #23 // ..............................................................................................................................................*............................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v29.4S, v11.4S, v8.S[0] // ..................................................................................................................................*........................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ sub v11.4S, v1.4S, v19.4S // ................................................................................................................................................*............................. + add v10.4S, v1.4S, v19.4S // .................................................................................................................................................*............................ + mls v22.4S, v14.4S, v8.S[0] // .............................................................................................................................*................................................ + mls v4.4S, v28.4S, v8.4S // ...........................................................................................................................................*.................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v18.4S, v12.4S, v8.4S // ...............................................................................................................................................*.............................. + mls v6.4S, v31.4S, v8.S[0] // .......................................................................................................................................*...................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v7.4S, v11.4S, v16.S[0] // ..................................................................................................................................................*........................... + sqrdmulh v11.4S, v11.4S, v16.S[1] // ...................................................................................................................................................*.......................... + str q10, [x1], #(16*4) // ....................................................................................................................................................................*......... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v23.4S, v5.4S, v29.4S // ..........................................................................................................................................................*................... + add v31.4S, v5.4S, v29.4S // ...........................................................................................................................................................*.................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v12.4S, v4.4S, v18.4S // .....................................................................................................................................................*........................ + sub v27.4S, v22.4S, v6.4S // ...............................................................................................................................................................*.............. + add v13.4S, v4.4S, v18.4S // ......................................................................................................................................................*....................... + add v10.4S, v22.4S, v6.4S // ................................................................................................................................................................*............. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mls v7.4S, v11.4S, v8.S[0] // ....................................................................................................................................................*......................... + str q31, [x1, #-32] // ......................................................................................................................................................................*....... + mul v31.4S, v23.4S, v16.S[0] // ............................................................................................................................................................*................. + sqrdmulh v3.4S, v23.4S, v16.S[1] // .............................................................................................................................................................*................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v5.4S, v12.4S, v16.S[0] // .......................................................................................................................................................*...................... + sqrdmulh v12.4S, v12.4S, v16.S[1] // ........................................................................................................................................................*..................... 
+ mul v28.4S, v27.4S, v16.S[0] // .................................................................................................................................................................*............ + sqrdmulh v27.4S, v27.4S, v16.S[1] // ..................................................................................................................................................................*........... + str q13, [x1, #-48] // .....................................................................................................................................................................*........ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q10, [x1, #-16] // .......................................................................................................................................................................*...... + add x1, x1, #64 // ............................................................................................................................................................................*. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q7, [x2], #(16*4) // ........................................................................................................................................................................*..... + mls v31.4S, v3.4S, v8.S[0] // ..............................................................................................................................................................*............... + ldr q29, [x1, #0] // e............................................................................................................................................................................. + ldr q7, [x1, #48] // ...e.......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v5.4S, v12.4S, v8.S[0] // .........................................................................................................................................................*.................... + mls v28.4S, v27.4S, v8.S[0] // ...................................................................................................................................................................*.......... + ldr q6, [x1, #16] // .e............................................................................................................................................................................ + ldr q19, [x1, #32] // ..e........................................................................................................................................................................... + ldr q11, [x5, #-96] // ..................................................e........................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q31, [x2, #-32] // ..........................................................................................................................................................................*... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + str q5, [x2, #-48] // .........................................................................................................................................................................*.... + str q28, [x2, #-16] // ...........................................................................................................................................................................*.. + add x2, x2, #64 // .............................................................................................................................................................................* + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + + // ---------------------------------------------------------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ + // ldr q9, [x1, #0] // ...................................................................................................................e..........'..................................................................................................................................................................~.......... 
+ // ldr q10, [x1, #16] // .......................................................................................................................e......'......................................................................................................................................................................~...... + // ldr q11, [x1, #32] // ........................................................................................................................e.....'.......................................................................................................................................................................~..... + // ldr q12, [x1, #48] // ....................................................................................................................e.........'...................................................................................................................................................................~......... + // trn1 v25.4s, v9.4s, v10.4s // ..............................................................................................................................'....*........................................................................................................................................................................ + // trn2 v26.4s, v9.4s, v10.4s // ..............................................................................................................................'*............................................................................................................................................................................ + // trn1 v27.4s, v11.4s, v12.4s // ..............................................................................................................................'.....*....................................................................................................................................................................... 
+ // trn2 v28.4s, v11.4s, v12.4s // ..............................................................................................................................'.*........................................................................................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ..............................................................................................................................'.........*................................................................................................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ..............................................................................................................................'...........*................................................................................................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ..............................................................................................................................'........*.................................................................................................................................................................... + // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................................................................'..........*.................................................................................................................................................................. 
+ // ldr q13, [x2, #0] // ..............................................................................................................................'...*......................................................................................................................................................................... + // ldr q14, [x2, #16] // ..............................................................................................................................'..*.......................................................................................................................................................................... + // ldr q15, [x2, #32] // ..............................................................................................................................*............................................................................................................................................................................. + // ldr q16, [x2, #48] // ..............................................................................................................................'......*...................................................................................................................................................................... + // trn1 v25.4s, v13.4s, v14.4s // ..............................................................................................................................'...................*......................................................................................................................................................... + // trn2 v26.4s, v13.4s, v14.4s // ..............................................................................................................................'.............*............................................................................................................................................................... 
+ // trn1 v27.4s, v15.4s, v16.4s // ..............................................................................................................................'.................*........................................................................................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................................................'..................*.......................................................................................................................................................... + // trn2 v15.2d, v25.2d, v27.2d // ..............................................................................................................................'.............................*............................................................................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ..............................................................................................................................'...........................*................................................................................................................................................. + // trn1 v13.2d, v25.2d, v27.2d // ..............................................................................................................................'..............................*.............................................................................................................................................. 
+ // trn1 v14.2d, v26.2d, v28.2d // ..............................................................................................................................'............................*................................................................................................................................................ + // ldr q0, [x5], #(12*16) // .........e....................................................................................................................'........................................................~.................................................................................................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ..............................................................................................................................'................................*............................................................................................................................................ + // ldr q1, [x5, #(-12*16 + 2*16)] // e.............................................................................................................................'...............................................~............................................................................................................................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ...................................................................e..........................................................'..................................................................................................................~.......................................................... 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // .............................................................................e................................................'............................................................................................................................~................................................ + // ldr q6, [x5, #(-12*16 + 5*16)] // ..........................................................................e...................................................'.........................................................................................................................~................................................... + // sub v24.4s, v9.4s, v10.4s // ..............................................................................................................................'...............*............................................................................................................................................................. + // add v9.4s, v9.4s, v10.4s // ..............................................................................................................................'................*............................................................................................................................................................ + // mul v10.4s, v24.4s, v1.4s // ..............................................................................................................................'......................*...................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................................................................................................'.......................*..................................................................................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..............................................................................................................................'......................................*...................................................................................................................................... + // sub v24.4s, v11.4s, v12.4s // ..............................................................................................................................'..............*.............................................................................................................................................................. + // add v11.4s, v11.4s, v12.4s // ..............................................................................................................................'....................*........................................................................................................................................................ + // mul v12.4s, v24.4s, v2.4s // ..............................................................................................................................'.........................*................................................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................................................................................................'........................*.................................................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ..............................................................................................................................'..................................*.......................................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............................................................................................................................'.................................*........................................................................................................................................... + // add v9.4s, v9.4s, v11.4s // ..........~...................................................................................................................'.........................................................*................................................................................................................... + // mul v11.4s, v24.4s, v0.4s // ..~...........................................................................................................................'.................................................*........................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...~..........................................................................................................................'..................................................*.......................................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ..............~...............................................................................................................'.............................................................*............................................................................................................... + // sub v24.4s, v10.4s, v12.4s // ..............................................................................................................................'..............................................*.............................................................................................................................. + // add v10.4s, v10.4s, v12.4s // ..............................................................................................................................'.............................................*............................................................................................................................... + // mul v12.4s, v24.4s, v0.4s // ........~.....................................................................................................................'.......................................................*..................................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ......~.......................................................................................................................'.....................................................*....................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................~.............................................................................................................'...............................................................*............................................................................................................. + // ldr q0, [x5, #(-12*16 + 6*16)] // .........................................................................................................................e....'........................................................................................................................................................................~.... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..............................................................................................................................'.....................*....................................................................................................................................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ..............................................................................................................................'.......*..................................................................................................................................................................... 
+ // ldr q5, [x5, #(-12*16 + 9*16)] // ..............................................................................................................................'..........................*.................................................................................................................................................. + // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................................................................................................................'...............................*............................................................................................................................................. + // ldr q6, [x5, #(-12*16 + 11*16)] // ..............................................................................................................................'............*................................................................................................................................................................ + // sub v24.4s, v13.4s, v14.4s // ..............................................................................................................................'....................................*........................................................................................................................................ + // add v13.4s, v13.4s, v14.4s // ..............................................................................................................................'.......................................*..................................................................................................................................... 
+ // mul v14.4s, v24.4s, v1.4s // ..............................................................................................................................'..........................................*.................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v5.4s // ..............................................................................................................................'.........................................*................................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // .......~......................................................................................................................'......................................................*...................................................................................................................... + // sub v24.4s, v15.4s, v16.4s // ..............................................................................................................................'.....................................*....................................................................................................................................... + // add v15.4s, v15.4s, v16.4s // ..............................................................................................................................'........................................*.................................................................................................................................... 
+ // mul v16.4s, v24.4s, v2.4s // ..............................................................................................................................'...........................................*................................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................................................................................................'............................................*................................................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ....~.........................................................................................................................'...................................................*......................................................................................................................... + // sub v24.4s, v13.4s, v15.4s // .~............................................................................................................................'................................................*............................................................................................................................ + // add v13.4s, v13.4s, v15.4s // ...............~..............................................................................................................'..............................................................*.............................................................................................................. 
+ // mul v15.4s, v24.4s, v0.4s // ............~.................................................................................................................'...........................................................*................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v4.4s // .............~................................................................................................................'............................................................*................................................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // ...................~..........................................................................................................'..................................................................*.......................................................................................................... + // sub v24.4s, v14.4s, v16.4s // .................~............................................................................................................'................................................................*............................................................................................................ + // add v14.4s, v14.4s, v16.4s // ..................~...........................................................................................................'.................................................................*........................................................................................................... 
+ // mul v16.4s, v24.4s, v0.4s // ........................~.....................................................................................................'.......................................................................*..................................................................................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .......................~......................................................................................................'......................................................................*...................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // .............................~................................................................................................'............................................................................*................................................................................................ + // trn1 v25.4s, v9.4s, v10.4s // ...........................~..................................................................................................'..........................................................................*.................................................................................................. + // trn2 v26.4s, v9.4s, v10.4s // ....................~.........................................................................................................'...................................................................*......................................................................................................... 
+ // trn1 v27.4s, v11.4s, v12.4s // ..........................~...................................................................................................'.........................................................................*................................................................................................... + // trn2 v28.4s, v11.4s, v12.4s // .........................~....................................................................................................'........................................................................*.................................................................................................... + // trn2 v11.2d, v25.2d, v27.2d // ..............................~...............................................................................................'.............................................................................*............................................................................................... + // trn2 v12.2d, v26.2d, v28.2d // ................................~.............................................................................................'...............................................................................*............................................................................................. + // trn1 v9.2d, v25.2d, v27.2d // ...............................~..............................................................................................'..............................................................................*.............................................................................................. 
+ // trn1 v10.2d, v26.2d, v28.2d // .................................~............................................................................................'................................................................................*............................................................................................ + // trn1 v25.4s, v13.4s, v14.4s // .....................~........................................................................................................'....................................................................*........................................................................................................ + // trn2 v26.4s, v13.4s, v14.4s // ......................~.......................................................................................................'.....................................................................*....................................................................................................... + // trn1 v27.4s, v15.4s, v16.4s // ....................................~.........................................................................................'...................................................................................*......................................................................................... + // trn2 v28.4s, v15.4s, v16.4s // .....................................~........................................................................................'....................................................................................*........................................................................................ 
+ // trn2 v15.2d, v25.2d, v27.2d // ........................................~.....................................................................................'.......................................................................................*..................................................................................... + // trn2 v16.2d, v26.2d, v28.2d // ..........................................~...................................................................................'.........................................................................................*................................................................................... + // trn1 v13.2d, v25.2d, v27.2d // ...........................................~..................................................................................'..........................................................................................*.................................................................................. + // trn1 v14.2d, v26.2d, v28.2d // .........................................~....................................................................................'........................................................................................*.................................................................................... + // ldr q0, [x4], #64 // ...........~..................................................................................................................'..........................................................*.................................................................................................................. 
+ // ldr q1, [x4, #(-64 + 16)] // ..............................................................................................................................'...................................*......................................................................................................................................... + // ldr q2, [x4, #(-64 + 32)] // .....~........................................................................................................................'....................................................*........................................................................................................................ + // ldr q3, [x4, #(-64 + 48)] // ............................~.................................................................................................'...........................................................................*................................................................................................. + // sub v24.4s, v9.4s, v10.4s // ............................................~.................................................................................'...........................................................................................*................................................................................. + // add v9.4s, v9.4s, v10.4s // .......................................~......................................................................................'......................................................................................*...................................................................................... 
+ // mul v10.4s, v24.4s, v1.s[2] // ...................................................~..........................................................................'..................................................................................................*.......................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................~.........................................................................'...................................................................................................*......................................................................... + // mls v10.4s, v24.4s, v8.s[0] // .............................................................~................................................................'............................................................................................................*................................................................ + // sub v24.4s, v11.4s, v12.4s // ..................................~...........................................................................................'.................................................................................*........................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................~..........................................................................................'..................................................................................*.......................................................................................... 
+ // mul v12.4s, v24.4s, v2.s[0] // ......................................~.......................................................................................'.....................................................................................*....................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .............................................~................................................................................'............................................................................................*................................................................................ + // mls v12.4s, v24.4s, v8.s[0] // ..........................................................~...................................................................'.........................................................................................................*................................................................... + // sub v24.4s, v13.4s, v14.4s // .................................................~............................................................................'................................................................................................*............................................................................ + // add v13.4s, v13.4s, v14.4s // ..................................................~...........................................................................'.................................................................................................*........................................................................... 
+ // mul v14.4s, v24.4s, v2.s[2] // ......................................................~.......................................................................'.....................................................................................................*....................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // .......................................................~......................................................................'......................................................................................................*...................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................................................................~.............................................................'...............................................................................................................*............................................................. + // sub v24.4s, v15.4s, v16.4s // ...............................................~..............................................................................'..............................................................................................*.............................................................................. + // add v15.4s, v15.4s, v16.4s // ................................................~.............................................................................'...............................................................................................*............................................................................. 
+ // mul v16.4s, v24.4s, v3.s[0] // ........................................................~.....................................................................'.......................................................................................................*..................................................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................~....................................................................'........................................................................................................*.................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................~..............................................................'..............................................................................................................*.............................................................. + // sub v24.4s, v9.4s, v11.4s // .................................................................~............................................................'................................................................................................................*............................................................ + // add v9.4s, v9.4s, v11.4s // ..............................................~...............................................................................'.............................................................................................*............................................................................... 
+ // mul v11.4s, v24.4s, v0.s[2] // ........................................................................~.....................................................'.......................................................................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................~....................................................'........................................................................................................................*.................................................... + // mls v11.4s, v24.4s, v8.s[0] // ....................................................................................~.........................................'...................................................................................................................................*......................................... + // sub v24.4s, v10.4s, v12.4s // ......................................................................~.......................................................'.....................................................................................................................*....................................................... + // add v10.4s, v10.4s, v12.4s // .......................................................................~......................................................'......................................................................................................................*...................................................... 
+ // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................~..............................................'..............................................................................................................................*.............................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................~.............................................'...............................................................................................................................*............................................. + // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................~....................................'........................................................................................................................................*.................................... + // sub v24.4s, v13.4s, v15.4s // ...........................................................~..................................................................'..........................................................................................................*.................................................................. + // add v13.4s, v13.4s, v15.4s // ..............................................................~...............................................................'.............................................................................................................*............................................................... 
+ // mul v15.4s, v24.4s, v1.s[0] // .....................................................................~........................................................'....................................................................................................................*........................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..................................................................~...........................................................'.................................................................................................................*........................................................... + // mls v15.4s, v24.4s, v8.s[0] // ......................................................................................~.......................................'.....................................................................................................................................*....................................... + // sub v24.4s, v14.4s, v16.4s // ...........................................................................~..................................................'..........................................................................................................................*.................................................. + // add v14.4s, v14.4s, v16.4s // ............................................................................~.................................................'...........................................................................................................................*................................................. 
+ // mul v16.4s, v24.4s, v1.s[0] // ..................................................................................~...........................................'.................................................................................................................................*........................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................~..........................................'..................................................................................................................................*.......................................... + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................~.................................'...........................................................................................................................................*................................. + // srshr v24.4S, v9.4S, #23 // .....................................................~........................................................................'....................................................................................................*........................................................................ + // mls v9.4s, v24.4s, v8.4s // ............................................................~.................................................................'...........................................................................................................*................................................................. 
+ // srshr v24.4S, v10.4S, #23 // .................................................................................~............................................'................................................................................................................................*............................................ + // mls v10.4s, v24.4s, v8.4s // ..........................................................................................~...................................'.........................................................................................................................................*................................... + // srshr v24.4S, v13.4S, #23 // ....................................................................~.........................................................'...................................................................................................................*......................................................... + // mls v13.4s, v24.4s, v8.4s // ..............................................................................~...............................................'.............................................................................................................................*............................................... + // srshr v24.4S, v14.4S, #23 // .....................................................................................~........................................'....................................................................................................................................*........................................ 
+ // mls v14.4s, v24.4s, v8.4s // ...........................................................................................~..................................'..........................................................................................................................................*.................................. + // sub v24.4s, v9.4s, v13.4s // .......................................................................................~......................................'......................................................................................................................................*...................................... + // add v9.4s, v9.4s, v13.4s // ........................................................................................~.....................................'.......................................................................................................................................*..................................... + // mul v13.4s, v24.4s, v0.s[0] // .............................................................................................~................................'............................................................................................................................................*................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................................................................~...............................'.............................................................................................................................................*............................... 
+ // mls v13.4s, v24.4s, v8.s[0] // ......................................................................................................~.......................'.....................................................................................................................................................*....................... + // sub v24.4s, v10.4s, v14.4s // ..................................................................................................~...........................'.................................................................................................................................................*........................... + // add v10.4s, v10.4s, v14.4s // ....................................................................................................~.........................'...................................................................................................................................................*......................... + // mul v14.4s, v24.4s, v0.s[0] // ..........................................................................................................~...................'.........................................................................................................................................................*................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................~..................'..........................................................................................................................................................*.................. 
+ // mls v14.4s, v24.4s, v8.s[0] // .....................................................................................................................~........'....................................................................................................................................................................*........ + // sub v24.4s, v11.4s, v15.4s // ................................................................................................~.............................'...............................................................................................................................................*............................. + // add v11.4s, v11.4s, v15.4s // .................................................................................................~............................'................................................................................................................................................*............................ + // mul v15.4s, v24.4s, v0.s[0] // ........................................................................................................~.....................'.......................................................................................................................................................*..................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................~....................'........................................................................................................................................................*.................... 
+ // mls v15.4s, v24.4s, v8.s[0] // ..................................................................................................................~...........'.................................................................................................................................................................*........... + // sub v24.4s, v12.4s, v16.4s // ...................................................................................................~..........................'..................................................................................................................................................*.......................... + // add v12.4s, v12.4s, v16.4s // .....................................................................................................~........................'....................................................................................................................................................*........................ + // mul v16.4s, v24.4s, v0.s[0] // ............................................................................................................~.................'...........................................................................................................................................................*................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................~................'............................................................................................................................................................*................ 
+ // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................~.......'.....................................................................................................................................................................*....... + // str q9, [x1], #(16*4) // ...............................................................................................~..............................'..............................................................................................................................................*.............................. + // str q10, [x1, #(-16*4 + 1*16)] // ..............................................................................................................~...............'.............................................................................................................................................................*............... + // str q11, [x1, #(-16*4 + 2*16)] // .......................................................................................................~......................'......................................................................................................................................................*...................... + // str q12, [x1, #(-16*4 + 3*16)] // ...............................................................................................................~..............'..............................................................................................................................................................*.............. 
+ // str q13, [x2], #(16*4) // .................................................................................................................~............'................................................................................................................................................................*............ + // str q14, [x2, #(-16*4 + 1*16)] // ...........................................................................................................................~..'..........................................................................................................................................................................*.. + // str q15, [x2, #(-16*4 + 2*16)] // ..........................................................................................................................~...'.........................................................................................................................................................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ............................................................................................................................~.'...........................................................................................................................................................................*. + // add x1, x1, #64 // ................................................................................................................~.............'...............................................................................................................................................................*............. 
+ // add x2, x2, #64 // .............................................................................................................................~'............................................................................................................................................................................* + + sub count, count, #1 + cbnz count, layer45678_start + // Instructions: 164 + // Expected cycles: 54 + // Expected IPC: 3.04 + // + // Wall time: 78.88s + // User time: 78.88s + // + // ------------------------------------------------------------------------ original position ------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + trn2 v27.4S, v29.4S, v6.4S // .*.................................................................................................................................................................. + trn1 v23.4S, v29.4S, v6.4S // .....*.............................................................................................................................................................. + trn2 v25.4S, v19.4S, v7.4S // ..*................................................................................................................................................................. + trn1 v7.4S, v19.4S, v7.4S // ......*............................................................................................................................................................. + ldr q9, [x2, #16] // ...*................................................................................................................................................................ 
+ ldr q13, [x2, #0] // ....*............................................................................................................................................................... + ldr q1, [x2, #32] // *................................................................................................................................................................... + // gap // .................................................................................................................................................................... + ldr q3, [x2, #48] // .......*............................................................................................................................................................ + ldr q20, [x5, #-64] // ........*........................................................................................................................................................... + ldr q21, [x5, #-16] // .............*...................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + trn1 v26.2D, v27.2D, v25.2D // ...........*........................................................................................................................................................ + trn2 v27.2D, v27.2D, v25.2D // ............*....................................................................................................................................................... + trn1 v25.2D, v23.2D, v7.2D // .........*.......................................................................................................................................................... + trn2 v7.2D, v23.2D, v7.2D // ..........*......................................................................................................................................................... + ldr q23, [x5, #-48] // ...........................*........................................................................................................................................ + ldr q18, [x5, #-32] // ................................*................................................................................................................................... + ldr q16, [x5, #-80] // ......................*............................................................................................................................................. + // gap // .................................................................................................................................................................... + ldr q17, [x5, #-176] // .................................*.................................................................................................................................. 
+ ldr q5, [x4, #32] // ....................................................*............................................................................................................... + ldr q4, [x4, #16] // ....................................*............................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v22.4S, v25.4S, v26.4S // ................*................................................................................................................................................... + add v25.4S, v25.4S, v26.4S // .................*.................................................................................................................................................. + sub v26.4S, v7.4S, v27.4S // ...............*.................................................................................................................................................... 
+ trn2 v14.4S, v13.4S, v9.4S // ..............*..................................................................................................................................................... + ldr q19, [x4, #48] // ..........................................................................*......................................................................................... + ldr q29, [x4], #64 // .........................................................*.......................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v9.4S, v13.4S, v9.4S // ....................*............................................................................................................................................... + trn2 v13.4S, v1.4S, v3.4S // ...................*................................................................................................................................................ + trn1 v1.4S, v1.4S, v3.4S // ..................*................................................................................................................................................. + add v27.4S, v7.4S, v27.4S // .....................*.............................................................................................................................................. + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v7.4S, v22.4S, v0.4S // .......................*............................................................................................................................................ + sqrdmulh v3.4S, v22.4S, v15.4S // ........................*........................................................................................................................................... + sqrdmulh v15.4S, v26.4S, v2.4S // .........................*.......................................................................................................................................... + mul v26.4S, v26.4S, v30.4S // ..........................*......................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + trn2 v0.2D, v14.2D, v13.2D // ............................*....................................................................................................................................... + trn1 v13.2D, v14.2D, v13.2D // .............................*...................................................................................................................................... + trn2 v2.2D, v9.2D, v1.2D // ..............................*..................................................................................................................................... + trn1 v9.2D, v9.2D, v1.2D // ...............................*.................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v1.4S, v25.4S, v27.4S // ..................................*................................................................................................................................. 
+ add v27.4S, v25.4S, v27.4S // ........................................................*........................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v7.4S, v3.4S, v8.S[0] // .......................................*............................................................................................................................ + mls v26.4S, v15.4S, v8.S[0] // ...................................*................................................................................................................................ + sub v25.4S, v9.4S, v13.4S // .....................................*.............................................................................................................................. 
+ sub v3.4S, v2.4S, v0.4S // ......................................*............................................................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v15.4S, v2.4S, v0.4S // .........................................*.......................................................................................................................... + add v9.4S, v9.4S, v13.4S // ........................................*........................................................................................................................... + mul v13.4S, v1.4S, v24.4S // .................................................*.................................................................................................................. + sqrdmulh v1.4S, v1.4S, v17.4S // ..................................................*................................................................................................................. + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v20.4S, v25.4S, v20.4S // ...........................................*........................................................................................................................ + sqrdmulh v21.4S, v3.4S, v21.4S // .............................................*...................................................................................................................... + sqrdmulh v23.4S, v25.4S, v23.4S // ..........................................*......................................................................................................................... + mul v25.4S, v3.4S, v18.4S // ............................................*....................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + sub v3.4S, v7.4S, v26.4S // ...............................................*.................................................................................................................... + add v7.4S, v7.4S, v26.4S // ..............................................*..................................................................................................................... + sub v26.4S, v9.4S, v15.4S // ................................................*................................................................................................................... + add v9.4S, v9.4S, v15.4S // .............................................................*...................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v13.4S, v1.4S, v8.S[0] // ............................................................*....................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v24.4S, v3.4S, v24.4S // .......................................................*............................................................................................................ + sqrdmulh v1.4S, v3.4S, v17.4S // .....................................................*.............................................................................................................. + mls v20.4S, v23.4S, v8.S[0] // ......................................................*............................................................................................................. 
+ mls v25.4S, v21.4S, v8.S[0] // ...................................................*................................................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v23.4S, v27.4S, v7.4S // .........................................................................*.......................................................................................... + trn2 v27.4S, v27.4S, v7.4S // ..................................................................*................................................................................................. + mul v7.4S, v26.4S, v11.4S // ..........................................................*......................................................................................................... + sqrdmulh v3.4S, v26.4S, v16.4S // ...........................................................*........................................................................................................ + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ mls v24.4S, v1.4S, v8.S[0] // ..............................................................*..................................................................................................... + sub v1.4S, v20.4S, v25.4S // ...............................................................*.................................................................................................... + add v25.4S, v20.4S, v25.4S // ................................................................*................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v7.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................. + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v11.4S, v1.4S, v11.4S // ......................................................................*............................................................................................. + sqrdmulh v1.4S, v1.4S, v16.4S // .....................................................................*.............................................................................................. + trn1 v3.4S, v9.4S, v25.4S // ...................................................................*................................................................................................ + trn2 v25.4S, v9.4S, v25.4S // ....................................................................*............................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v9.4S, v13.4S, v24.4S // ........................................................................*........................................................................................... + trn2 v24.4S, v13.4S, v24.4S // .......................................................................*............................................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v13.2D, v23.2D, v9.2D // ............................................................................*....................................................................................... + trn1 v23.2D, v23.2D, v9.2D // .............................................................................*...................................................................................... 
+ mls v11.4S, v1.4S, v8.S[0] // ...........................................................................*........................................................................................ + trn2 v9.2D, v27.2D, v24.2D // ..............................................................................*..................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn1 v27.2D, v27.2D, v24.2D // ...............................................................................*.................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v24.4S, v13.4S, v9.4S // ................................................................................*................................................................................... + add v9.4S, v13.4S, v9.4S // .................................................................................*.................................................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + trn1 v13.4S, v7.4S, v11.4S // ..................................................................................*................................................................................. + trn2 v11.4S, v7.4S, v11.4S // ...................................................................................*................................................................................ + add v7.4S, v23.4S, v27.4S // .....................................................................................*.............................................................................. + sub v27.4S, v23.4S, v27.4S // ..........................................................................................*......................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v23.4S, v24.4S, v5.S[0] // ....................................................................................*............................................................................... 
+ sqrdmulh v24.4S, v24.4S, v5.S[1] // ...........................................................................................*........................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + trn2 v1.2D, v3.2D, v13.2D // ......................................................................................*............................................................................. + trn1 v13.2D, v3.2D, v13.2D // .........................................................................................*.......................................................................... + trn2 v3.2D, v25.2D, v11.2D // ........................................................................................*........................................................................... 
+ trn1 v11.2D, v25.2D, v11.2D // .......................................................................................*............................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v25.4S, v7.4S, v9.4S // ...............................................................................................................*.................................................... + add v7.4S, v7.4S, v9.4S // ............................................................................................*....................................................................... + mul v9.4S, v27.4S, v4.S[2] // .................................................................................................*.................................................................. + sqrdmulh v27.4S, v27.4S, v4.S[3] // ..................................................................................................*................................................................. + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v20.4S, v1.4S, v3.4S // .............................................................................................*...................................................................... + add v1.4S, v1.4S, v3.4S // ..............................................................................................*..................................................................... + sub v3.4S, v13.4S, v11.4S // ...............................................................................................*.................................................................... + add v11.4S, v13.4S, v11.4S // ................................................................................................*................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + mls v23.4S, v24.4S, v8.S[0] // ........................................................................................................*........................................................... + mul v24.4S, v25.4S, v29.S[2] // .....................................................................................................................*.............................................. + sqrdmulh v25.4S, v25.4S, v29.S[3] // ......................................................................................................................*............................................. + srshr v13.4S, v7.4S, #23 // ...................................................................................................*................................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v21.4S, v3.4S, v5.S[2] // ....................................................................................................*............................................................... 
+ sqrdmulh v3.4S, v3.4S, v5.S[3] // .....................................................................................................*.............................................................. + mul v15.4S, v20.4S, v19.S[0] // ......................................................................................................*............................................................. + sqrdmulh v20.4S, v20.4S, v19.S[1] // .......................................................................................................*............................................................ + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v9.4S, v27.4S, v8.S[0] // ...........................................................................................................*........................................................ + sub v27.4S, v11.4S, v1.4S // .........................................................................................................*.......................................................... + add v11.4S, v11.4S, v1.4S // ............................................................................................................*....................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v7.4S, v13.4S, v8.4S // ..........................................................................................................*......................................................... + mls v24.4S, v25.4S, v8.S[0] // ...............................................................................................................................*.................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v21.4S, v3.4S, v8.S[0] // ..............................................................................................................*..................................................... + mls v15.4S, v20.4S, v8.S[0] // .............................................................................................................*...................................................... + srshr v25.4S, v11.4S, #23 // .................................................................................................................*.................................................. + sqrdmulh v13.4S, v27.4S, v4.S[1] // ................................................................................................................*................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ sub v1.4S, v9.4S, v23.4S // ...................................................................................................................*................................................ + add v3.4S, v9.4S, v23.4S // ....................................................................................................................*............................................... + mul v27.4S, v27.4S, v4.S[0] // ..................................................................................................................*................................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v11.4S, v25.4S, v8.4S // .........................................................................................................................*.......................................... + sub v23.4S, v21.4S, v15.4S // .......................................................................................................................*............................................ + add v9.4S, v21.4S, v15.4S // ........................................................................................................................*........................................... + mul v21.4S, v1.4S, v29.S[2] // ..........................................................................................................................*......................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v27.4S, v13.4S, v8.S[0] // .................................................................................................................................*.................................. + sqrdmulh v25.4S, v1.4S, v29.S[3] // ...........................................................................................................................*........................................ + srshr v13.4S, v3.4S, #23 // ............................................................................................................................*....................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mul v20.4S, v23.4S, v4.S[0] // .............................................................................................................................*...................................... + sqrdmulh v23.4S, v23.4S, v4.S[1] // ..............................................................................................................................*..................................... + srshr v1.4S, v9.4S, #23 // ................................................................................................................................*................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ add v15.4S, v7.4S, v11.4S // ...................................................................................................................................*................................ + sub v11.4S, v7.4S, v11.4S // ..................................................................................................................................*................................. + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + add v7.4S, v24.4S, v27.4S // ............................................................................................................................................*....................... + sub v27.4S, v24.4S, v27.4S // ...........................................................................................................................................*........................ 
+ mls v3.4S, v13.4S, v8.4S // .....................................................................................................................................*.............................. + mls v21.4S, v25.4S, v8.S[0] // ....................................................................................................................................*............................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v9.4S, v1.4S, v8.4S // ......................................................................................................................................*............................. + mls v20.4S, v23.4S, v8.S[0] // .......................................................................................................................................*............................ + str q15, [x1], #(16*4) // ..........................................................................................................................................*......................... + mul v24.4S, v11.4S, v29.S[0] // ........................................................................................................................................*........................... 
+ sqrdmulh v11.4S, v11.4S, v29.S[1] // .........................................................................................................................................*.......................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q7, [x1, #-32] // ..................................................................................................................................................*................. + mul v7.4S, v27.4S, v29.S[0] // ...................................................................................................................................................*................ + sqrdmulh v27.4S, v27.4S, v29.S[1] // ....................................................................................................................................................*............... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + sub v23.4S, v3.4S, v9.4S // .............................................................................................................................................*...................... 
+ add v9.4S, v3.4S, v9.4S // ...............................................................................................................................................*.................... + sub v25.4S, v21.4S, v20.4S // ..............................................................................................................................................*..................... + add v13.4S, v21.4S, v20.4S // ................................................................................................................................................*................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v24.4S, v11.4S, v8.S[0] // .................................................................................................................................................*.................. + mls v7.4S, v27.4S, v8.S[0] // .............................................................................................................................................................*...... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + str q9, [x1, #-48] // .........................................................................................................................................................*.......... + str q13, [x1, #-16] // ..........................................................................................................................................................*......... + add x1, x1, #64 // ...........................................................................................................................................................*........ + mul v27.4S, v23.4S, v29.S[0] // .....................................................................................................................................................*.............. + sqrdmulh v11.4S, v23.4S, v29.S[1] // ......................................................................................................................................................*............. 
+ mul v23.4S, v25.4S, v29.S[0] // .......................................................................................................................................................*............ + sqrdmulh v25.4S, v25.4S, v29.S[1] // ........................................................................................................................................................*........... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ str q24, [x2], #(16*4) // ............................................................................................................................................................*....... + str q7, [x2, #-32] // ................................................................................................................................................................*... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + mls v27.4S, v11.4S, v8.S[0] // ..............................................................................................................................................................*..... + mls v23.4S, v25.4S, v8.S[0] // ...............................................................................................................................................................*.... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ str q27, [x2, #-48] // .................................................................................................................................................................*.. + str q23, [x2, #-16] // ..................................................................................................................................................................*. + add x2, x2, #64 // ...................................................................................................................................................................* + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... + // gap // .................................................................................................................................................................... 
+ + // -------------------------------------------------------------------------- new position ---------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + // ldr q10, [x2, #32] // ......*............................................................................................................................................................. + // trn2 v25.4S, v29.4S, v6.4S // *................................................................................................................................................................... + // trn2 v22.4S, v19.4S, v7.4S // ..*................................................................................................................................................................. + // ldr q14, [x2, #16] // ....*............................................................................................................................................................... + // ldr q20, [x2, #0] // .....*.............................................................................................................................................................. + // trn1 v23.4S, v29.4S, v6.4S // .*.................................................................................................................................................................. + // trn1 v4.4S, v19.4S, v7.4S // ...*................................................................................................................................................................ + // ldr q19, [x2, #48] // .......*............................................................................................................................................................ 
+ // ldr q26, [x5, #-64] // ........*........................................................................................................................................................... + // trn1 v17.2D, v23.2D, v4.2D // ............*....................................................................................................................................................... + // trn2 v18.2D, v23.2D, v4.2D // .............*...................................................................................................................................................... + // trn1 v5.2D, v25.2D, v22.2D // ..........*......................................................................................................................................................... + // trn2 v7.2D, v25.2D, v22.2D // ...........*........................................................................................................................................................ + // ldr q31, [x5, #-16] // .........*.......................................................................................................................................................... + // trn2 v28.4S, v20.4S, v14.4S // .......................*............................................................................................................................................ + // sub v22.4S, v18.4S, v7.4S // ......................*............................................................................................................................................. + // sub v9.4S, v17.4S, v5.4S // ....................*............................................................................................................................................... + // add v16.4S, v17.4S, v5.4S // .....................*.............................................................................................................................................. 
+ // trn1 v21.4S, v10.4S, v19.4S // ............................*....................................................................................................................................... + // trn2 v6.4S, v10.4S, v19.4S // ...........................*........................................................................................................................................ + // trn1 v19.4S, v20.4S, v14.4S // ..........................*......................................................................................................................................... + // add v4.4S, v18.4S, v7.4S // .............................*...................................................................................................................................... + // ldr q14, [x5, #-80] // ................*................................................................................................................................................... + // mul v0.4S, v9.4S, v0.4S // ..............................*..................................................................................................................................... + // sqrdmulh v29.4S, v9.4S, v15.4S // ...............................*.................................................................................................................................... + // sqrdmulh v17.4S, v22.4S, v2.4S // ................................*................................................................................................................................... + // mul v22.4S, v22.4S, v30.4S // .................................*.................................................................................................................................. + // ldr q30, [x5, #-48] // ..............*..................................................................................................................................................... 
+ // trn2 v20.2D, v28.2D, v6.2D // ..................................*................................................................................................................................. + // trn1 v28.2D, v28.2D, v6.2D // ...................................*................................................................................................................................ + // trn2 v10.2D, v19.2D, v21.2D // ....................................*............................................................................................................................... + // trn1 v21.2D, v19.2D, v21.2D // .....................................*.............................................................................................................................. + // ldr q6, [x5, #-32] // ...............*.................................................................................................................................................... + // ldr q18, [x5, #-176] // .................*.................................................................................................................................................. + // sub v12.4S, v16.4S, v4.4S // ......................................*............................................................................................................................. + // mls v22.4S, v17.4S, v8.S[0] // .........................................*.......................................................................................................................... + // ldr q17, [x4, #16] // ...................*................................................................................................................................................ + // sub v13.4S, v21.4S, v28.4S // ..........................................*......................................................................................................................... 
+ // sub v27.4S, v10.4S, v20.4S // ...........................................*........................................................................................................................ + // mls v0.4S, v29.4S, v8.S[0] // ........................................*........................................................................................................................... + // add v2.4S, v21.4S, v28.4S // .............................................*...................................................................................................................... + // add v28.4S, v10.4S, v20.4S // ............................................*....................................................................................................................... + // sqrdmulh v19.4S, v13.4S, v30.4S // ..................................................*................................................................................................................. + // mul v29.4S, v13.4S, v26.4S // ................................................*................................................................................................................... + // mul v30.4S, v27.4S, v6.4S // ...................................................*................................................................................................................ + // sqrdmulh v6.4S, v27.4S, v31.4S // .................................................*.................................................................................................................. + // add v21.4S, v0.4S, v22.4S // .....................................................*.............................................................................................................. 
+ // sub v31.4S, v0.4S, v22.4S // ....................................................*............................................................................................................... + // sub v10.4S, v2.4S, v28.4S // ......................................................*............................................................................................................. + // mul v27.4S, v12.4S, v24.4S // ..............................................*..................................................................................................................... + // sqrdmulh v12.4S, v12.4S, v18.4S // ...............................................*.................................................................................................................... + // mls v30.4S, v6.4S, v8.S[0] // ............................................................*....................................................................................................... + // ldr q6, [x4, #32] // ..................*................................................................................................................................................. + // sqrdmulh v3.4S, v31.4S, v18.4S // ..........................................................*......................................................................................................... + // mls v29.4S, v19.4S, v8.S[0] // ...........................................................*........................................................................................................ + // mul v19.4S, v31.4S, v24.4S // .........................................................*.......................................................................................................... + // add v31.4S, v16.4S, v4.4S // .......................................*............................................................................................................................ 
+ // ldr q16, [x4], #64 // .........................*.......................................................................................................................................... + // mul v22.4S, v10.4S, v11.4S // ...............................................................*.................................................................................................... + // sqrdmulh v10.4S, v10.4S, v14.4S // ................................................................*................................................................................................... + // mls v27.4S, v12.4S, v8.S[0] // ........................................................*........................................................................................................... + // add v12.4S, v2.4S, v28.4S // .......................................................*............................................................................................................ + // mls v19.4S, v3.4S, v8.S[0] // .................................................................*.................................................................................................. + // sub v1.4S, v29.4S, v30.4S // ..................................................................*................................................................................................. + // add v28.4S, v29.4S, v30.4S // ...................................................................*................................................................................................ + // mls v22.4S, v10.4S, v8.S[0] // ....................................................................*............................................................................................... + // trn2 v30.4S, v31.4S, v21.4S // ..............................................................*..................................................................................................... 
+ // trn1 v25.4S, v12.4S, v28.4S // .......................................................................*............................................................................................ + // trn2 v28.4S, v12.4S, v28.4S // ........................................................................*........................................................................................... + // sqrdmulh v10.4S, v1.4S, v14.4S // ......................................................................*............................................................................................. + // mul v5.4S, v1.4S, v11.4S // .....................................................................*.............................................................................................. + // trn2 v26.4S, v27.4S, v19.4S // ..........................................................................*......................................................................................... + // trn1 v27.4S, v27.4S, v19.4S // .........................................................................*.......................................................................................... + // trn1 v19.4S, v31.4S, v21.4S // .............................................................*...................................................................................................... + // ldr q31, [x4, #-16] // ........................*........................................................................................................................................... + // mls v5.4S, v10.4S, v8.S[0] // .............................................................................*...................................................................................... 
+ // trn2 v10.2D, v19.2D, v27.2D // ...........................................................................*........................................................................................ + // trn1 v19.2D, v19.2D, v27.2D // ............................................................................*....................................................................................... + // trn2 v29.2D, v30.2D, v26.2D // ..............................................................................*..................................................................................... + // trn1 v4.2D, v30.2D, v26.2D // ...............................................................................*.................................................................................... + // sub v12.4S, v10.4S, v29.4S // ................................................................................*................................................................................... + // add v23.4S, v10.4S, v29.4S // .................................................................................*.................................................................................. + // trn1 v30.4S, v22.4S, v5.4S // ..................................................................................*................................................................................. + // trn2 v9.4S, v22.4S, v5.4S // ...................................................................................*................................................................................ + // mul v14.4S, v12.4S, v6.S[0] // ......................................................................................*............................................................................. 
+ // add v18.4S, v19.4S, v4.4S // ....................................................................................*............................................................................... + // trn2 v3.2D, v25.2D, v30.2D // ........................................................................................*........................................................................... + // trn1 v10.2D, v28.2D, v9.2D // ...........................................................................................*........................................................................ + // trn2 v7.2D, v28.2D, v9.2D // ..........................................................................................*......................................................................... + // trn1 v11.2D, v25.2D, v30.2D // .........................................................................................*.......................................................................... + // sub v28.4S, v19.4S, v4.4S // .....................................................................................*.............................................................................. + // sqrdmulh v5.4S, v12.4S, v6.S[1] // .......................................................................................*............................................................................ + // add v1.4S, v18.4S, v23.4S // .............................................................................................*...................................................................... + // sub v29.4S, v3.4S, v7.4S // ................................................................................................*................................................................... + // add v27.4S, v3.4S, v7.4S // .................................................................................................*.................................................................. 
+ // sub v12.4S, v11.4S, v10.4S // ..................................................................................................*................................................................. + // add v7.4S, v11.4S, v10.4S // ...................................................................................................*................................................................ + // mul v22.4S, v28.4S, v17.S[2] // ..............................................................................................*..................................................................... + // sqrdmulh v19.4S, v28.4S, v17.S[3] // ...............................................................................................*.................................................................... + // srshr v10.4S, v1.4S, #23 // .......................................................................................................*............................................................ + // mul v30.4S, v12.4S, v6.S[2] // ........................................................................................................*........................................................... + // sqrdmulh v6.4S, v12.4S, v6.S[3] // .........................................................................................................*.......................................................... + // mul v20.4S, v29.4S, v31.S[0] // ..........................................................................................................*......................................................... + // sqrdmulh v15.4S, v29.4S, v31.S[1] // ...........................................................................................................*........................................................ 
+ // mls v14.4S, v5.4S, v8.S[0] // ....................................................................................................*............................................................... + // sub v2.4S, v7.4S, v27.4S // .............................................................................................................*...................................................... + // mls v1.4S, v10.4S, v8.4S // ...............................................................................................................*.................................................... + // mls v22.4S, v19.4S, v8.S[0] // ............................................................................................................*....................................................... + // add v19.4S, v7.4S, v27.4S // ..............................................................................................................*..................................................... + // mls v20.4S, v15.4S, v8.S[0] // ..................................................................................................................*................................................. + // mls v30.4S, v6.4S, v8.S[0] // .................................................................................................................*.................................................. + // sub v12.4S, v18.4S, v23.4S // ............................................................................................*....................................................................... + // sqrdmulh v11.4S, v2.4S, v17.S[1] // ....................................................................................................................*............................................... 
+ // srshr v10.4S, v19.4S, #23 // ...................................................................................................................*................................................ + // mul v29.4S, v2.4S, v17.S[0] // .......................................................................................................................*............................................ + // sub v28.4S, v22.4S, v14.4S // .....................................................................................................................*.............................................. + // add v4.4S, v22.4S, v14.4S // ......................................................................................................................*............................................. + // mul v5.4S, v12.4S, v16.S[2] // .....................................................................................................*.............................................................. + // sqrdmulh v27.4S, v12.4S, v16.S[3] // ......................................................................................................*............................................................. + // sub v31.4S, v30.4S, v20.4S // .........................................................................................................................*.......................................... + // add v18.4S, v30.4S, v20.4S // ..........................................................................................................................*......................................... + // mls v19.4S, v10.4S, v8.4S // ........................................................................................................................*........................................... 
+ // mul v22.4S, v28.4S, v16.S[2] // ...........................................................................................................................*........................................ + // sqrdmulh v14.4S, v28.4S, v16.S[3] // .............................................................................................................................*...................................... + // srshr v28.4S, v4.4S, #23 // ..............................................................................................................................*..................................... + // mul v6.4S, v31.4S, v17.S[0] // ...............................................................................................................................*.................................... + // sqrdmulh v31.4S, v31.4S, v17.S[1] // ................................................................................................................................*................................... + // mls v5.4S, v27.4S, v8.S[0] // ................................................................................................................*................................................... + // srshr v12.4S, v18.4S, #23 // .................................................................................................................................*.................................. + // mls v29.4S, v11.4S, v8.S[0] // ............................................................................................................................*....................................... + // sub v11.4S, v1.4S, v19.4S // ...................................................................................................................................*................................ 
+ // add v10.4S, v1.4S, v19.4S // ..................................................................................................................................*................................. + // mls v22.4S, v14.4S, v8.S[0] // .......................................................................................................................................*............................ + // mls v4.4S, v28.4S, v8.4S // ......................................................................................................................................*............................. + // mls v18.4S, v12.4S, v8.4S // ........................................................................................................................................*........................... + // mls v6.4S, v31.4S, v8.S[0] // .........................................................................................................................................*.......................... + // mul v7.4S, v11.4S, v16.S[0] // ...........................................................................................................................................*........................ + // sqrdmulh v11.4S, v11.4S, v16.S[1] // ............................................................................................................................................*....................... + // str q10, [x1], #(16*4) // ..........................................................................................................................................*......................... + // sub v23.4S, v5.4S, v29.4S // .....................................................................................................................................*.............................. + // add v31.4S, v5.4S, v29.4S // ....................................................................................................................................*............................... 
+ // sub v12.4S, v4.4S, v18.4S // ................................................................................................................................................*................... + // sub v27.4S, v22.4S, v6.4S // ..................................................................................................................................................*................. + // add v13.4S, v4.4S, v18.4S // .................................................................................................................................................*.................. + // add v10.4S, v22.4S, v6.4S // ...................................................................................................................................................*................ + // mls v7.4S, v11.4S, v8.S[0] // ....................................................................................................................................................*............... + // str q31, [x1, #-32] // .............................................................................................................................................*...................... + // mul v31.4S, v23.4S, v16.S[0] // ..............................................................................................................................................*..................... + // sqrdmulh v3.4S, v23.4S, v16.S[1] // ...............................................................................................................................................*.................... + // mul v5.4S, v12.4S, v16.S[0] // .........................................................................................................................................................*.......... 
+ // sqrdmulh v12.4S, v12.4S, v16.S[1] // ..........................................................................................................................................................*......... + // mul v28.4S, v27.4S, v16.S[0] // ...........................................................................................................................................................*........ + // sqrdmulh v27.4S, v27.4S, v16.S[1] // ............................................................................................................................................................*....... + // str q13, [x1, #-48] // ......................................................................................................................................................*............. + // str q10, [x1, #-16] // .......................................................................................................................................................*............ + // add x1, x1, #64 // ........................................................................................................................................................*........... + // str q7, [x2], #(16*4) // .............................................................................................................................................................*...... + // mls v31.4S, v3.4S, v8.S[0] // .....................................................................................................................................................*.............. + // mls v5.4S, v12.4S, v8.S[0] // ...............................................................................................................................................................*.... + // mls v28.4S, v27.4S, v8.S[0] // ................................................................................................................................................................*... 
+ // str q31, [x2, #-32] // ..............................................................................................................................................................*..... + // str q5, [x2, #-48] // .................................................................................................................................................................*.. + // str q28, [x2, #-16] // ..................................................................................................................................................................*. + // add x2, x2, #64 // ...................................................................................................................................................................* + + +// ----------------------------------------------------------------------------- + + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + // Instructions: 78 + // Expected cycles: 25 + // Expected IPC: 3.12 + // + // Wall time: 3.49s + // User time: 3.49s + // + // ----------------------------- original position -----------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-- + ldr q11, [x0, #384] // ..*........................................................................... + ldr q27, [x0, #256] // *............................................................................. + ldr q7, [x0, #896] // .*............................................................................ + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + ldr q23, [x0, #768] // ...*.......................................................................... + ldr q24, [x0, #128] // ....*......................................................................... + ldr q9, [x0, #0] // .....*........................................................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + ldr q13, [x0, #640] // ......*....................................................................... + ldr q20, [x0, #512] // .......*...................................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v21.4S, v27.4S, v11.4S // ........*..................................................................... + add v27.4S, v27.4S, v11.4S // .........*.................................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v11.4S, v23.4S, v7.4S // .............*................................................................ + add v7.4S, v23.4S, v7.4S // ............*................................................................. + sub v23.4S, v9.4S, v24.4S // ..........*................................................................... + add v24.4S, v9.4S, v24.4S // ...........*.................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v9.4S, v20.4S, v13.4S // .................*............................................................ + add v13.4S, v20.4S, v13.4S // ................*............................................................. + sqrdmulh v20.4S, v21.4S, v2.S[1] // ..............*............................................................... + mul v21.4S, v21.4S, v2.S[0] // ...............*.............................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v15.4S, v11.4S, v3.S[0] // ..................*........................................................... + sqrdmulh v11.4S, v11.4S, v3.S[1] // .....................*........................................................ + add v18.4S, v24.4S, v27.4S // ....................*......................................................... + mul v16.4S, v23.4S, v1.S[2] // ...................*.......................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ sqrdmulh v23.4S, v23.4S, v1.S[3] // .......................*...................................................... + mul v17.4S, v9.4S, v2.S[2] // ........................*..................................................... + sqrdmulh v9.4S, v9.4S, v2.S[3] // .........................*.................................................... + add v5.4S, v13.4S, v7.4S // ......................*....................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v27.4S, v24.4S, v27.4S // ...........................*.................................................. + sub v7.4S, v13.4S, v7.4S // ............................*................................................. + mls v21.4S, v20.4S, v8.S[0] // ..........................*................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v15.4S, v11.4S, v8.S[0] // ..............................*............................................... + sub v11.4S, v18.4S, v5.4S // .............................*................................................ + add v24.4S, v18.4S, v5.4S // ...............................*.............................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v16.4S, v23.4S, v8.S[0] // ..................................*........................................... + mls v17.4S, v9.4S, v8.S[0] // ...................................*.......................................... + mul v9.4S, v27.4S, v0.S[2] // ................................*............................................. + sqrdmulh v27.4S, v27.4S, v0.S[3] // .................................*............................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v13.4S, v11.4S, v0.S[1] // .....................................*........................................ + mul v10.4S, v11.4S, v0.S[0] // .......................................*...................................... + mul v23.4S, v24.4S, v25.4S // ....................................*......................................... + sqrdmulh v11.4S, v24.4S, v26.4S // ......................................*....................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + mul v20.4S, v7.4S, v1.S[0] // ........................................*..................................... + sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v24.4S, v16.4S, v21.4S // ...........................................*.................................. + sub v21.4S, v16.4S, v21.4S // ..................................................*........................... + mls v9.4S, v27.4S, v8.S[0] // ............................................*................................. + add v27.4S, v17.4S, v15.4S // ..........................................*................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v18.4S, v17.4S, v15.4S // ........................................................*..................... + mls v10.4S, v13.4S, v8.S[0] // .............................................*................................ 
+ mls v23.4S, v11.4S, v8.S[0] // ..............................................*............................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v11.4S, v24.4S, v27.4S // ................................................*............................. + mls v20.4S, v7.4S, v8.S[0] // .................................................*............................ + sub v27.4S, v24.4S, v27.4S // ...............................................*.............................. + sqrdmulh v7.4S, v21.4S, v0.S[3] // ..........................................................*................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v15.4S, v21.4S, v0.S[2] // ...........................................................*.................. + mul v4.4S, v18.4S, v1.S[0] // .............................................................*................ + sqrdmulh v13.4S, v18.4S, v1.S[1] // ...............................................................*.............. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + mul v24.4S, v11.4S, v25.4S // ....................................................*......................... + sqrdmulh v11.4S, v11.4S, v26.4S // .......................................................*...................... + sqrdmulh v21.4S, v27.4S, v0.S[1] // .....................................................*........................ + mul v27.4S, v27.4S, v0.S[0] // ......................................................*....................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v18.4S, v9.4S, v20.4S // .........................................................*.................... + cmge v16.4S, v23.4S, v30.4S // ...................................................*.......................... + cmge v17.4S, v31.4S, v23.4S // ..................................................................*........... + cmge v5.4S, v31.4S, v10.4S // ....................................................................*......... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v9.4S, v9.4S, v20.4S // ......................................................................*....... 
+ mls v15.4S, v7.4S, v8.S[0] // ...................................................................*.......... + mls v4.4S, v13.4S, v8.S[0] // .......................................................................*...... + cmge v7.4S, v10.4S, v30.4S // .....................................................................*........ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v24.4S, v11.4S, v8.S[0] // ................................................................*............. + mls v27.4S, v21.4S, v8.S[0] // ............................................................*................. + mul v11.4S, v18.4S, v0.S[0] // ..............................................................*............... + sqrdmulh v13.4S, v18.4S, v0.S[1] // .................................................................*............ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v19.4S, v17.4S, v16.4S // .............................................................................* + sub v6.4S, v5.4S, v7.4S // ...........................................................................*.. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v11.4S, v13.4S, v8.S[0] // ............................................................................*. + cmge v28.4S, v31.4S, v27.4S // ........................................................................*..... + cmge v20.4S, v27.4S, v30.4S // .........................................................................*.... + cmge v22.4S, v24.4S, v30.4S // ..........................................................................*... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ + // ------------------------------- new position --------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|-- + // ldr q16, [x0, #256] // .*............................................................................ + // ldr q12, [x0, #896] // ..*........................................................................... + // ldr q18, [x0, #384] // *............................................................................. + // ldr q5, [x0, #768] // ...*.......................................................................... + // ldr q21, [x0, #128] // ....*......................................................................... + // ldr q20, [x0, #0] // .....*........................................................................ + // ldr q29, [x0, #640] // ......*....................................................................... + // ldr q15, [x0, #512] // .......*...................................................................... + // sub v19.4S, v16.4S, v18.4S // ........*..................................................................... + // add v17.4S, v16.4S, v18.4S // .........*.................................................................... + // sub v23.4S, v20.4S, v21.4S // ............*................................................................. + // add v6.4S, v20.4S, v21.4S // .............*................................................................ + // add v28.4S, v5.4S, v12.4S // ...........*.................................................................. + // sub v21.4S, v5.4S, v12.4S // ..........*................................................................... + // sqrdmulh v5.4S, v19.4S, v2.S[1] // ................*............................................................. + // mul v20.4S, v19.4S, v2.S[0] // .................*............................................................ 
+ // add v19.4S, v15.4S, v29.4S // ...............*.............................................................. + // sub v29.4S, v15.4S, v29.4S // ..............*............................................................... + // mul v15.4S, v21.4S, v3.S[0] // ..................*........................................................... + // mul v27.4S, v23.4S, v1.S[2] // .....................*........................................................ + // add v18.4S, v6.4S, v17.4S // ....................*......................................................... + // sqrdmulh v12.4S, v21.4S, v3.S[1] // ...................*.......................................................... + // add v10.4S, v19.4S, v28.4S // .........................*.................................................... + // sqrdmulh v23.4S, v23.4S, v1.S[3] // ......................*....................................................... + // mul v21.4S, v29.4S, v2.S[2] // .......................*...................................................... + // sqrdmulh v24.4S, v29.4S, v2.S[3] // ........................*..................................................... + // mls v20.4S, v5.4S, v8.S[0] // ............................*................................................. + // sub v6.4S, v6.4S, v17.4S // ..........................*................................................... + // sub v19.4S, v19.4S, v28.4S // ...........................*.................................................. + // sub v16.4S, v18.4S, v10.4S // ..............................*............................................... + // mls v15.4S, v12.4S, v8.S[0] // .............................*................................................ + // add v10.4S, v18.4S, v10.4S // ...............................*.............................................. + // mul v18.4S, v6.4S, v0.S[2] // ..................................*........................................... 
+ // sqrdmulh v12.4S, v6.4S, v0.S[3] // ...................................*.......................................... + // mls v27.4S, v23.4S, v8.S[0] // ................................*............................................. + // mls v21.4S, v24.4S, v8.S[0] // .................................*............................................ + // mul v23.4S, v10.4S, v25.4S // ......................................*....................................... + // sqrdmulh v5.4S, v16.4S, v0.S[1] // ....................................*......................................... + // sqrdmulh v17.4S, v10.4S, v26.4S // .......................................*...................................... + // mul v10.4S, v16.4S, v0.S[0] // .....................................*........................................ + // mul v16.4S, v19.4S, v1.S[0] // ........................................*..................................... + // sqrdmulh v28.4S, v19.4S, v1.S[1] // .........................................*.................................... + // add v22.4S, v21.4S, v15.4S // .............................................*................................ + // add v24.4S, v27.4S, v20.4S // ..........................................*................................... + // mls v18.4S, v12.4S, v8.S[0] // ............................................*................................. + // mls v10.4S, v5.4S, v8.S[0] // ...............................................*.............................. + // mls v23.4S, v17.4S, v8.S[0] // ................................................*............................. + // sub v4.4S, v24.4S, v22.4S // ...................................................*.......................... + // add v12.4S, v24.4S, v22.4S // .................................................*............................ + // mls v16.4S, v28.4S, v8.S[0] // ..................................................*........................... 
+ // sub v20.4S, v27.4S, v20.4S // ...........................................*.................................. + // cmge v17.4S, v23.4S, v30.4S // .............................................................*................ + // mul v24.4S, v12.4S, v25.4S // ........................................................*..................... + // sqrdmulh v22.4S, v4.4S, v0.S[1] // ..........................................................*................... + // mul v27.4S, v4.4S, v0.S[0] // ...........................................................*.................. + // sqrdmulh v28.4S, v12.4S, v26.4S // .........................................................*.................... + // sub v29.4S, v21.4S, v15.4S // ..............................................*............................... + // sub v21.4S, v18.4S, v16.4S // ............................................................*................. + // sqrdmulh v14.4S, v20.4S, v0.S[3] // ....................................................*......................... + // mul v15.4S, v20.4S, v0.S[2] // .....................................................*........................ + // mls v27.4S, v22.4S, v8.S[0] // .....................................................................*........ + // mul v4.4S, v29.4S, v1.S[0] // ......................................................*....................... + // mul v11.4S, v21.4S, v0.S[0] // ......................................................................*....... + // sqrdmulh v12.4S, v29.4S, v1.S[1] // .......................................................*...................... + // mls v24.4S, v28.4S, v8.S[0] // ....................................................................*......... + // sqrdmulh v5.4S, v21.4S, v0.S[1] // .......................................................................*...... + // cmge v19.4S, v31.4S, v23.4S // ..............................................................*............... 
+ // mls v15.4S, v14.4S, v8.S[0] // .................................................................*............ + // cmge v29.4S, v31.4S, v10.4S // ...............................................................*.............. + // cmge v21.4S, v10.4S, v30.4S // ...................................................................*.......... + // add v9.4S, v18.4S, v16.4S // ................................................................*............. + // mls v4.4S, v12.4S, v8.S[0] // ..................................................................*........... + // cmge v28.4S, v31.4S, v27.4S // ...........................................................................*.. + // cmge v20.4S, v27.4S, v30.4S // ............................................................................*. + // cmge v22.4S, v24.4S, v30.4S // .............................................................................* + // sub v6.4S, v29.4S, v21.4S // .........................................................................*.... + // mls v11.4S, v5.4S, v8.S[0] // ..........................................................................*... + // sub v19.4S, v19.4S, v17.4S // ........................................................................*..... + + sub count, count, #1 +layer123_start: + // Instructions: 120 + // Expected cycles: 26 + // Expected IPC: 4.62 + // + // Wall time: 966.20s + // User time: 966.20s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + ldr q16, [x0, #272] // ..e..................................................................................................................... + ldr q12, [x0, #912] // .......e................................................................................................................ 
+ ldr q18, [x0, #400] // ...e.................................................................................................................... + cmge v7.4S, v31.4S, v24.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + sub v17.4S, v28.4S, v20.4S // ..........................................................................*............................................. + sqrdmulh v13.4S, v9.4S, v26.4S // ...............................................................................................*........................ + mul v9.4S, v9.4S, v25.4S // ..............................................................................................*......................... + ldr q5, [x0, #784] // ......e................................................................................................................. + mls v10.4S, v6.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sub v28.4S, v15.4S, v4.4S // ...............................................................*........................................................ + add v6.4S, v15.4S, v4.4S // ................................................................*....................................................... + mls v23.4S, v19.4S, v8.4S // .......................................................................................................*................ + ldr q21, [x0, #144] // .e...................................................................................................................... 
+ ldr q20, [x0, #16] // e....................................................................................................................... + mls v27.4S, v17.4S, v8.4S // ...........................................................................*............................................ + sub v17.4S, v7.4S, v22.4S // ..........................................................................................................*............. + cmge v4.4S, v11.4S, v30.4S // .............................................................................*.......................................... + cmge v14.4S, v31.4S, v11.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q29, [x0, #656] // .....e.................................................................................................................. + ldr q15, [x0, #528] // ....e................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v7.4S, v28.4S, v0.S[0] // .................................................................*...................................................... 
+ mls v9.4S, v13.4S, v8.S[0] // ................................................................................................*....................... + sqrdmulh v22.4S, v28.4S, v0.S[1] // ..................................................................*..................................................... + mul v13.4S, v6.4S, v25.4S // .................................................................................................*...................... + mls v24.4S, v17.4S, v8.4S // ...........................................................................................................*............ + sub v19.4S, v16.4S, v18.4S // .............e.......................................................................................................... + add v17.4S, v16.4S, v18.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v16.4S, v6.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + str q23, [x0], #(16) // ....................................................................................................................*... + sub v23.4S, v20.4S, v21.4S // ........e............................................................................................................... + add v6.4S, v20.4S, v21.4S // .........e.............................................................................................................. + add v28.4S, v5.4S, v12.4S // ........................e............................................................................................... + sub v21.4S, v5.4S, v12.4S // .......................e................................................................................................ + sqrdmulh v5.4S, v19.4S, v2.S[1] // ................e....................................................................................................... + mul v20.4S, v19.4S, v2.S[0] // ...............e........................................................................................................ + add v19.4S, v15.4S, v29.4S // ...................e.................................................................................................... + str q27, [x0, #624] // .....................................................................................*.................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v29.4S, v15.4S, v29.4S // ..................e..................................................................................................... 
+ str q10, [x0, #496] // ....................................................................................*................................... + str q24, [x0, #112] // .....................................................................................................................*.. + mul v15.4S, v21.4S, v3.S[0] // .........................e.............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v27.4S, v23.4S, v1.S[2] // ..........e............................................................................................................. + add v18.4S, v6.4S, v17.4S // .............................e.......................................................................................... + sqrdmulh v12.4S, v21.4S, v3.S[1] // ..........................e............................................................................................. + add v10.4S, v19.4S, v28.4S // .......................................e................................................................................ + sqrdmulh v23.4S, v23.4S, v1.S[3] // ...........e............................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v21.4S, v29.4S, v2.S[2] // ....................e................................................................................................... + sqrdmulh v24.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. + // gap // ........................................................................................................................ + mls v20.4S, v5.4S, v8.S[0] // .................e...................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v16.4S, v8.S[0] // ...................................................................................................*.................... + cmge v29.4S, v9.4S, v30.4S // .............................................................................................................*.......... + sub v6.4S, v6.4S, v17.4S // ............................e........................................................................................... + sub v19.4S, v19.4S, v28.4S // ......................................e................................................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v16.4S, v18.4S, v10.4S // ................................................e....................................................................... + mls v15.4S, v12.4S, v8.S[0] // ...........................e............................................................................................ + add v10.4S, v18.4S, v10.4S // .................................................e...................................................................... + mul v18.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... + sqrdmulh v12.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ + mls v27.4S, v23.4S, v8.S[0] // ............e........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v21.4S, v24.4S, v8.S[0] // ......................e................................................................................................. + mul v23.4S, v10.4S, v25.4S // ........................................................................................e............................... + // gap // ........................................................................................................................ + sqrdmulh v5.4S, v16.4S, v0.S[1] // ...................................................e.................................................................... + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v10.4S, v26.4S // .........................................................................................e.............................. + // gap // ........................................................................................................................ + mul v10.4S, v16.4S, v0.S[0] // ..................................................e..................................................................... + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v9.4S // ............................................................................................................*........... + mls v7.4S, v22.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v16.4S, v19.4S, v1.S[0] // ........................................e............................................................................... + sqrdmulh v28.4S, v19.4S, v1.S[1] // .........................................e.............................................................................. + add v22.4S, v21.4S, v15.4S // ............................................e........................................................................... + // gap // ........................................................................................................................ + add v24.4S, v27.4S, v20.4S // ..................................e..................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v19.4S, v13.4S, v30.4S // .................................................................................................................*...... + mls v18.4S, v12.4S, v8.S[0] // ................................e....................................................................................... + mls v10.4S, v5.4S, v8.S[0] // ....................................................e................................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v23.4S, v17.4S, v8.S[0] // ..........................................................................................e............................. + sub v29.4S, v6.4S, v29.4S // ..............................................................................................................*......... + sub v17.4S, v14.4S, v4.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v6.4S, v7.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + sub v4.4S, v24.4S, v22.4S // .....................................................e.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v12.4S, v24.4S, v22.4S // ......................................................e................................................................. 
+ mls v16.4S, v28.4S, v8.S[0] // ..........................................e............................................................................. + cmge v5.4S, v31.4S, v13.4S // ................................................................................................................*....... + cmge v14.4S, v31.4S, v7.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v20.4S, v27.4S, v20.4S // .................................e...................................................................................... + mls v11.4S, v17.4S, v8.4S // ...............................................................................*........................................ + cmge v17.4S, v23.4S, v30.4S // .....................................................................................................e.................. + mul v24.4S, v12.4S, v25.4S // ...........................................................................................e............................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v22.4S, v4.4S, v0.S[1] // ........................................................e............................................................... + mul v27.4S, v4.4S, v0.S[0] // .......................................................e................................................................ + sqrdmulh v28.4S, v12.4S, v26.4S // ............................................................................................e........................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v9.4S, v29.4S, v8.4S // ...............................................................................................................*........ + sub v29.4S, v21.4S, v15.4S // ...........................................e............................................................................ + sub v21.4S, v18.4S, v16.4S // ..........................................................e............................................................. + sub v5.4S, v5.4S, v19.4S // ..................................................................................................................*..... 
+ sub v19.4S, v14.4S, v6.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v20.4S, v0.S[3] // ....................................e................................................................................... + mul v15.4S, v20.4S, v0.S[2] // ...................................e.................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v27.4S, v22.4S, v8.S[0] // .........................................................e.............................................................. + mul v4.4S, v29.4S, v1.S[0] // .............................................e.......................................................................... + str q11, [x0, #752] // ......................................................................................*................................. 
+ mul v11.4S, v21.4S, v0.S[0] // ............................................................e........................................................... + sqrdmulh v12.4S, v29.4S, v1.S[1] // ..............................................e......................................................................... + // gap // ........................................................................................................................ + mls v13.4S, v5.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + mls v24.4S, v28.4S, v8.S[0] // .............................................................................................e.......................... + sqrdmulh v5.4S, v21.4S, v0.S[1] // .............................................................e.......................................................... + // gap // ........................................................................................................................ + mls v7.4S, v19.4S, v8.4S // ...................................................................................*.................................... + // gap // ........................................................................................................................ + cmge v19.4S, v31.4S, v23.4S // ....................................................................................................e................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + str q9, [x0, #240] // ......................................................................................................................*. + mls v15.4S, v14.4S, v8.S[0] // .....................................e.................................................................................. + cmge v29.4S, v31.4S, v10.4S // ....................................................................e................................................... + cmge v21.4S, v10.4S, v30.4S // .....................................................................e.................................................. + add v9.4S, v18.4S, v16.4S // ...........................................................e............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v4.4S, v12.4S, v8.S[0] // ...............................................e........................................................................ + cmge v28.4S, v31.4S, v27.4S // ........................................................................e............................................... + cmge v20.4S, v27.4S, v30.4S // .........................................................................e.............................................. 
+ cmge v22.4S, v24.4S, v30.4S // .........................................................................................................e.............. + sub v6.4S, v29.4S, v21.4S // ......................................................................e................................................. + mls v11.4S, v5.4S, v8.S[0] // ..............................................................e......................................................... + str q7, [x0, #880] // .......................................................................................*................................ + str q13, [x0, #368] // .......................................................................................................................* + sub v19.4S, v19.4S, v17.4S // ......................................................................................................e................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + + // ---------------------------------------------------------------------------------------------------------------- new position ----------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------- + // ldr q9, [x0, #0] // .............e..........................................................................................................'............~......................................................................................................... 
+ // ldr q10, [x0, #(1*(1024/8))] // ............e...........................................................................................................'...........~.......................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // e.......................................................................................................................~...................................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ..e.....................................................................................................................'.~.................................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ...................e....................................................................................................'..................~................................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ..................e.....................................................................................................'.................~.................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .......e................................................................................................................'......~............................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // .e......................................................................................................................'~..................................................................................................................... 
+ // sub v24.4s, v9.4s, v10.4s // .............................e..........................................................................................'............................~......................................................................................... + // add v9.4s, v9.4s, v10.4s // ..............................e.........................................................................................'.............................~........................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // .........................................e..............................................................................'........................................~............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................e..........................................................................'............................................~......................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..........................................................e.............................................................'.........................................................~............................................................ + // sub v24.4s, v11.4s, v12.4s // .........................e..............................................................................................'........................~............................................................................................. + // add v11.4s, v11.4s, v12.4s // ..........................e.............................................................................................'.........................~............................................................................................ 
+ // mul v12.4s, v24.4s, v2.s[0] // ..................................e.....................................................................................'.................................~.................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .................................e......................................................................................'................................~..................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................................................e.......................................................................'...............................................~...................................................................... + // sub v24.4s, v13.4s, v14.4s // .....................................e..................................................................................'....................................~................................................................................. + // add v13.4s, v13.4s, v14.4s // ...................................e....................................................................................'..................................~................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ..............................................e.........................................................................'.............................................~........................................................................ + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...............................................e........................................................................'..............................................~....................................................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ...........................................................e............................................................'..........................................................~........................................................... + // sub v24.4s, v15.4s, v16.4s // ................................e.......................................................................................'...............................~...................................................................................... + // add v15.4s, v15.4s, v16.4s // ...............................e........................................................................................'..............................~....................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ........................................e...............................................................................'.......................................~.............................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................e............................................................................'..........................................~........................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ......................................................e.................................................................'.....................................................~................................................................ + // sub v24.4s, v9.4s, v11.4s // ...................................................e....................................................................'..................................................~................................................................... 
+ // add v9.4s, v9.4s, v11.4s // ..........................................e.............................................................................'.........................................~............................................................................ + // mul v11.4s, v24.4s, v0.s[2] // ........................................................e...............................................................'.......................................................~.............................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................e..............................................................'........................................................~............................................................. + // mls v11.4s, v24.4s, v8.s[0] // .......................................................................e................................................'......................................................................~............................................... + // sub v24.4s, v10.4s, v12.4s // ..................................................................................e.....................................'.................................................................................~.................................... + // add v10.4s, v10.4s, v12.4s // .....................................................................e..................................................'....................................................................~................................................. + // mul v12.4s, v24.4s, v0.s[2] // ...............................................................................................e........................'..............................................................................................~....................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..............................................................................................e.........................'.............................................................................................~........................ + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................................................e............'..........................................................................................................~........... + // sub v24.4s, v13.4s, v15.4s // ....................................................e...................................................................'...................................................~.................................................................. + // add v13.4s, v13.4s, v15.4s // ............................................e...........................................................................'...........................................~.......................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ..................................................................e.....................................................'.................................................................~.................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................e....................................................'..................................................................~................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................e........................................'..............................................................................~....................................... 
+ // sub v24.4s, v14.4s, v16.4s // ..........................................................................................e.............................'.........................................................................................~............................ + // add v14.4s, v14.4s, v16.4s // ....................................................................e...................................................'...................................................................~.................................................. + // mul v16.4s, v24.4s, v1.s[0] // .................................................................................................e......................'................................................................................................~..................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................e...................'...................................................................................................~.................. + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................e........'..............................................................................................................~....... + // sub v24.4s, v9.4s, v13.4s // .....................................................e..................................................................'....................................................~................................................................. + // add v9.4s, v9.4s, v13.4s // .......................................................e................................................................'......................................................~............................................................... 
+ // mul v13.4s, v24.4s, v0.s[0] // ...............................................................e........................................................'..............................................................~....................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................e..........................................................'............................................................~......................................................... + // mls v13.4s, v24.4s, v8.s[0] // ........................................................................e...............................................'.......................................................................~.............................................. + // sub v24.4s, v10.4s, v14.4s // .............................................................................e..........................................'............................................................................~......................................... + // add v10.4s, v10.4s, v14.4s // ..............................................................................e.........................................'.............................................................................~........................................ + // mul v14.4s, v24.4s, v0.s[0] // .......................................................................................e................................'......................................................................................~............................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................e.................................'.....................................................................................~................................ 
+ // mls v14.4s, v24.4s, v8.s[0] // ................................................................................................e.......................'...............................................................................................~...................... + // sub v24.4s, v11.4s, v15.4s // ...........................................................................................e............................'..........................................................................................~........................... + // add v11.4s, v11.4s, v15.4s // ..............................................................................................................e.........'.............................................................................................................~........ + // mul v15.4s, v24.4s, v0.s[0] // ...................................................................................................e....................'..................................................................................................~................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................e................'......................................................................................................~............... + // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................................e...'...................................................................................................................~.. + // sub v24.4s, v12.4s, v16.4s // .........~..............................................................................................................'........*............................................................................................................. 
+ // add v12.4s, v12.4s, v16.4s // ..........~.............................................................................................................'.........*............................................................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ....................~...................................................................................................'...................*.................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................~.................................................................................................'.....................*................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .................................................................~......................................................'................................................................*..................................................... + // cmge v27.4s, v31.4s, v13.4s // ............................................................................................................e...........'...........................................................................................................~.......... + // cmge v28.4s, v13.4s, v30.4s // .............................................................................................................e..........'............................................................................................................~......... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................................................e....'..................................................................................................................~... 
+ // mls v13.4s, v28.4s, v8.4s // ........~...............................................................................................................'.......*.............................................................................................................. + // cmge v27.4s, v31.4s, v14.4s // ................................................................................................................e.......'...............................................................................................................~...... + // cmge v28.4s, v14.4s, v30.4s // .................................................................................................................e......'................................................................................................................~..... + // sub v28.4s, v27.4s, v28.4s // ....~...................................................................................................................'...*.................................................................................................................. + // mls v14.4s, v28.4s, v8.4s // ..............~.........................................................................................................'.............*........................................................................................................ + // cmge v27.4s, v31.4s, v15.4s // .................~......................................................................................................'................*..................................................................................................... + // cmge v28.4s, v15.4s, v30.4s // ................~.......................................................................................................'...............*...................................................................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ...........................................................................~............................................'..........................................................................*........................................... + // mls v15.4s, v28.4s, v8.4s // ...................................................................................~....................................'..................................................................................*................................... + // cmge v27.4s, v31.4s, v16.4s // .................................................................................~......................................'................................................................................*..................................... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................~...........................................'...........................................................................*.......................................... + // sub v28.4s, v27.4s, v28.4s // .............................................................................................~..........................'............................................................................................*......................... + // mls v16.4s, v28.4s, v8.4s // ........................................................................................................~...............'.......................................................................................................*.............. + // str q13, [x0, #(4*(1024/8))] // ......................................~.................................................................................'.....................................*................................................................................ 
+ // str q14, [x0, #(5*(1024/8))] // ....................................~...................................................................................'...................................*.................................................................................. + // str q15, [x0, #(6*(1024/8))] // ..................................................................................................~.....................'.................................................................................................*.................... + // str q16, [x0, #(7*(1024/8))] // .....................................................................................................................~..'....................................................................................................................*. + // mul v13.4s, v9.4s, v25.4s // ............................................................e...........................................................'...........................................................~.......................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ..............................................................e.........................................................'.............................................................~........................................................ + // mls v13.4s, v9.4s, v8.s[0] // .........................................................................e..............................................'........................................................................~............................................. + // mul v14.4s, v10.4s, v25.4s // .....................................................................................e..................................'....................................................................................~................................. 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // ........................................................................................e...............................'.......................................................................................~.............................. + // mls v14.4s, v10.4s, v8.s[0] // ......................................................................................................e.................'.....................................................................................................~................ + // mul v15.4s, v11.4s, v25.4s // ......~.................................................................................................................'.....*................................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // .....~..................................................................................................................'....*................................................................................................................. + // mls v15.4s, v11.4s, v8.s[0] // .....................~..................................................................................................'....................*................................................................................................. + // mul v16.4s, v12.4s, v25.4s // .......................~................................................................................................'......................*............................................................................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ...........................~............................................................................................'..........................*........................................................................................... 
+ // mls v16.4s, v12.4s, v8.s[0] // .................................................~......................................................................'................................................*..................................................................... + // cmge v27.4s, v31.4s, v13.4s // .........................................................................................................e..............'........................................................................................................~............. + // cmge v28.4s, v13.4s, v30.4s // ....................................................................................e...................................'...................................................................................~.................................. + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................................e'...................................................................................................................... + // mls v13.4s, v28.4s, v8.4s // ...........~............................................................................................................'..........*........................................................................................................... + // cmge v27.4s, v31.4s, v14.4s // ...~....................................................................................................................'..*................................................................................................................... + // cmge v28.4s, v14.4s, v30.4s // ..................................................................................................................e.....'.................................................................................................................~.... 
+ // sub v28.4s, v27.4s, v28.4s // ...............~........................................................................................................'..............*....................................................................................................... + // mls v14.4s, v28.4s, v8.4s // ........................~...............................................................................................'.......................*.............................................................................................. + // cmge v27.4s, v31.4s, v15.4s // ................................................................~.......................................................'...............................................................*...................................................... + // cmge v28.4s, v15.4s, v30.4s // ..................................................~.....................................................................'.................................................*.................................................................... + // sub v28.4s, v27.4s, v28.4s // ..........................................................................~.............................................'.........................................................................*............................................ + // mls v15.4s, v28.4s, v8.4s // .........................................................................................~..............................'........................................................................................*............................. + // cmge v27.4s, v31.4s, v16.4s // ................................................................................~.......................................'...............................................................................*...................................... 
+ // cmge v28.4s, v16.4s, v30.4s // ......................................................................~.................................................'.....................................................................*................................................ + // sub v28.4s, v27.4s, v28.4s // ............................................................................................~...........................'...........................................................................................*.......................... + // mls v16.4s, v28.4s, v8.4s // .....................................................................................................~..................'....................................................................................................*................. + // str q13, [x0], #(16) // ............................~...........................................................................................'...........................*.......................................................................................... + // str q14, [x0, #(-16 + 1*(1024/8))] // .......................................~................................................................................'......................................*............................................................................... + // str q15, [x0, #(-16 + 2*(1024/8))] // ..........................................................................................................~.............'.........................................................................................................*............ 
+ // str q16, [x0, #(-16 + 3*(1024/8))] // ......................................................................................................................~.'.....................................................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + // Instructions: 42 + // Expected cycles: 16 + // Expected IPC: 2.62 + // + // Wall time: 0.44s + // User time: 0.44s + // + // ----------- original position -----------> + // 0 25 + // |------------------------|---------------- + add v21.4S, v15.4S, v4.4S // ......*................................... + sqrdmulh v17.4S, v9.4S, v26.4S // ..*....................................... + mul v16.4S, v9.4S, v25.4S // ...*...................................... + sub v15.4S, v15.4S, v4.4S // .....*.................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + cmge v13.4S, v11.4S, v30.4S // ..........*............................... + cmge v7.4S, v31.4S, v11.4S // ...........*.............................. + cmge v5.4S, v31.4S, v24.4S // *......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mul v4.4S, v21.4S, v25.4S // ...............*.......................... + sqrdmulh v9.4S, v21.4S, v26.4S // .................*........................ + sqrdmulh v21.4S, v15.4S, v0.S[1] // ..............*........................... + mul v15.4S, v15.4S, v0.S[0] // ............*............................. + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v10.4S, v6.4S, v8.4S // ....*..................................... + mls v23.4S, v19.4S, v8.4S // .......*.................................. + mls v16.4S, v17.4S, v8.S[0] // .............*............................ + sub v7.4S, v7.4S, v13.4S // ............................*............. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + sub v18.4S, v5.4S, v22.4S // .........*................................ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v15.4S, v21.4S, v8.S[0] // .........................*................ + sub v21.4S, v28.4S, v20.4S // .*........................................ + mls v11.4S, v7.4S, v8.4S // ................................*......... + mls v4.4S, v9.4S, v8.S[0] // ......................*................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v24.4S, v18.4S, v8.4S // ................*......................... + str q10, [x0, #512] // ....................*..................... + cmge v13.4S, v16.4S, v30.4S // .......................*.................. + cmge v10.4S, v31.4S, v16.4S // ........................*................. 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v27.4S, v21.4S, v8.4S // ........*................................. + str q23, [x0], #(16) // ..................*....................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q11, [x0, #752] // ....................................*..... + cmge v9.4S, v4.4S, v30.4S // ..........................*............... + cmge v11.4S, v31.4S, v4.4S // ..............................*........... + cmge v7.4S, v15.4S, v30.4S // .............................*............ + cmge v23.4S, v31.4S, v15.4S // ...............................*.......... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q24, [x0, #112] // .....................*.................... + sub v24.4S, v10.4S, v13.4S // ...........................*.............. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q27, [x0, #624] // ...................*...................... + sub v11.4S, v11.4S, v9.4S // ..................................*....... + sub v27.4S, v23.4S, v7.4S // ...................................*...... + // gap // .......................................... 
+ // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v16.4S, v24.4S, v8.4S // .................................*........ + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + mls v15.4S, v27.4S, v8.4S // ......................................*... + mls v4.4S, v11.4S, v8.4S // .....................................*.... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + str q16, [x0, #240] // .......................................*.. + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... 
+ // gap // .......................................... + str q15, [x0, #880] // ........................................*. + str q4, [x0, #368] // .........................................* + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + // gap // .......................................... + + // ------------- new position --------------> + // 0 25 + // |------------------------|---------------- + // cmge v7.4S, v31.4S, v24.4S // ......*................................... + // sub v17.4S, v28.4S, v20.4S // .................*........................ + // sqrdmulh v13.4S, v9.4S, v26.4S // .*........................................ + // mul v9.4S, v9.4S, v25.4S // ..*....................................... + // mls v10.4S, v6.4S, v8.4S // ...........*.............................. + // sub v28.4S, v15.4S, v4.4S // ...*...................................... + // add v6.4S, v15.4S, v4.4S // *......................................... + // mls v23.4S, v19.4S, v8.4S // ............*............................. + // mls v27.4S, v17.4S, v8.4S // ........................*................. + // sub v17.4S, v7.4S, v22.4S // ...............*.......................... + // cmge v4.4S, v11.4S, v30.4S // ....*..................................... + // cmge v14.4S, v31.4S, v11.4S // .....*.................................... + // mul v7.4S, v28.4S, v0.S[0] // ..........*............................... + // mls v9.4S, v13.4S, v8.S[0] // .............*............................ + // sqrdmulh v22.4S, v28.4S, v0.S[1] // .........*................................ + // mul v13.4S, v6.4S, v25.4S // .......*.................................. + // mls v24.4S, v17.4S, v8.4S // ....................*..................... 
+ // sqrdmulh v16.4S, v6.4S, v26.4S // ........*................................. + // str q23, [x0], #(16) // .........................*................ + // str q27, [x0, #624] // .................................*........ + // str q10, [x0, #496] // .....................*.................... + // str q24, [x0, #112] // ...............................*.......... + // mls v13.4S, v16.4S, v8.S[0] // ...................*...................... + // cmge v29.4S, v9.4S, v30.4S // ......................*................... + // cmge v6.4S, v31.4S, v9.4S // .......................*.................. + // mls v7.4S, v22.4S, v8.S[0] // ................*......................... + // cmge v19.4S, v13.4S, v30.4S // ...........................*.............. + // sub v29.4S, v6.4S, v29.4S // ................................*......... + // sub v17.4S, v14.4S, v4.4S // ..............*........................... + // cmge v6.4S, v7.4S, v30.4S // .............................*............ + // cmge v5.4S, v31.4S, v13.4S // ............................*............. + // cmge v14.4S, v31.4S, v7.4S // ..............................*........... + // mls v11.4S, v17.4S, v8.4S // ..................*....................... + // mls v9.4S, v29.4S, v8.4S // ....................................*..... + // sub v5.4S, v5.4S, v19.4S // ..................................*....... + // sub v19.4S, v14.4S, v6.4S // ...................................*...... + // str q11, [x0, #752] // ..........................*............... + // mls v13.4S, v5.4S, v8.4S // ......................................*... + // mls v7.4S, v19.4S, v8.4S // .....................................*.... + // str q9, [x0, #240] // .......................................*.. + // str q7, [x0, #880] // ........................................*. 
+ // str q13, [x0, #368] // .........................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s new file mode 100644 index 00000000..d6caacdb --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1,2155 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 
+.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, consts +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + 
str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // @slothy:no-unfold + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] 
+ stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // @slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) +.endm + +.macro save_vregs // @slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // @slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // @slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // @slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // @slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // @slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm + .global _intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm: +_intt_dilithium_123_45678_manual_ld4_opt_m1_icestorm: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + 
qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + // Instructions: 162 + // Expected cycles: 71 + // Expected IPC: 2.28 + // + // Wall time: 162.16s + // User time: 162.16s + // + // ----------------------------------------------------------------------- original position -----------------------------------------------------------------------> + // 0 25 50 
75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + ldr q25, [x2, #16] // ......................*........................................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + ldr q22, [x2, #0] // ........................*......................................................................................................................................... + ldr q30, [x2, #32] // .........................*........................................................................................................................................ + ldr q9, [x2, #48] // ...............................*.................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + ldr q23, [x1, #32] // ..*............................................................................................................................................................... + // gap // .................................................................................................................................................................. 
+ ldr q10, [x1, #48] // ...*.............................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + ldr q24, [x1, #16] // .*................................................................................................................................................................ + ldr q20, [x1, #0] // *................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v3.4S, v22.4S, v25.4S // ......................................*........................................................................................................................... + ldr q0, [x4, #16] // ........................................................................*......................................................................................... + trn1 v31.4S, v22.4S, v25.4S // ....................................*............................................................................................................................. + ldr q1, [x5, #144] // ..................*............................................................................................................................................... 
+ ldr q16, [x4], #64 // ............................................................................................*..................................................................... + trn2 v29.4S, v30.4S, v9.4S // ..........................................*....................................................................................................................... + // gap // .................................................................................................................................................................. + trn1 v28.4S, v30.4S, v9.4S // ............................................*..................................................................................................................... + ldr q6, [x4, #-16] // .....................................................................*............................................................................................ + trn2 v25.4S, v23.4S, v10.4S // .......*.......................................................................................................................................................... + // gap // .................................................................................................................................................................. + trn1 v21.4S, v23.4S, v10.4S // ........*......................................................................................................................................................... + trn2 v10.2D, v3.2D, v29.2D // ..............................................*................................................................................................................... + ldr q11, [x5, #96] // .........................................*........................................................................................................................ 
+ trn2 v14.4S, v20.4S, v24.4S // ....*............................................................................................................................................................. + ldr q5, [x5, #160] // ...........*...................................................................................................................................................... + trn1 v17.2D, v3.2D, v29.2D // ................................................*................................................................................................................. + ldr q7, [x5, #176] // ................................*................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v26.2D, v31.2D, v28.2D // .................................................*................................................................................................................ + trn1 v12.2D, v14.2D, v25.2D // ................*................................................................................................................................................. + trn1 v2.2D, v31.2D, v28.2D // ....................................................*............................................................................................................. + ldr q31, [x5, #128] // .........*........................................................................................................................................................ + // gap // .................................................................................................................................................................. 
+ ldr q23, [x5, #112] // ........................................*......................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sub v13.4S, v26.4S, v10.4S // .....................................................*............................................................................................................ + sub v19.4S, v2.4S, v17.4S // .........................................................*........................................................................................................ + ldr q29, [x5], #(12*16) // ...................................*.............................................................................................................................. + trn1 v28.4S, v20.4S, v24.4S // .....*............................................................................................................................................................ + ldr q15, [x5, #-128] // ......*........................................................................................................................................................... + ldr q27, [x5, #-144] // ..........*....................................................................................................................................................... + // gap // .................................................................................................................................................................. 
+ mul v9.4S, v13.4S, v5.4S // ................................................................*................................................................................................. + sqrdmulh v20.4S, v13.4S, v7.4S // ............................................................*..................................................................................................... + sqrdmulh v1.4S, v19.4S, v1.4S // .................................................................*................................................................................................ + mul v24.4S, v19.4S, v31.4S // .............................................................*.................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v13.4S, v2.4S, v17.4S // ..........................................................................*....................................................................................... + trn1 v19.2D, v28.2D, v21.2D // .................*................................................................................................................................................ + // gap // .................................................................................................................................................................. + ldr q2, [x5, #-160] // ..............*................................................................................................................................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v22.4S, v26.4S, v10.4S // ......................................................................*........................................................................................... + mls v9.4S, v20.4S, v8.S[0] // ...........................................................................*...................................................................................... + mls v24.4S, v1.4S, v8.S[0] // .........................................................................*........................................................................................ + sub v17.4S, v19.4S, v12.4S // .....................*............................................................................................................................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ add v26.4S, v13.4S, v22.4S // ..............................................................................*................................................................................... + sub v10.4S, v13.4S, v22.4S // ...............................................................................*.................................................................................. + sqrdmulh v18.4S, v17.4S, v27.4S // ............................*..................................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v31.2D, v14.2D, v25.2D // ............*..................................................................................................................................................... + sub v4.4S, v24.4S, v9.4S // .................................................................................*................................................................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v27.4S, v17.4S, v2.4S // .............................*.................................................................................................................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v30.4S, v10.4S, v23.4S // ..................................................................................*............................................................................... + mul v20.4S, v10.4S, v11.4S // ...................................................................................*.............................................................................. + mul v10.4S, v4.4S, v11.4S // .....................................................................................*............................................................................ + sqrdmulh v14.4S, v4.4S, v23.4S // ....................................................................................*............................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v27.4S, v18.4S, v8.S[0] // .......................................*.......................................................................................................................... + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + trn2 v3.2D, v28.2D, v21.2D // .............*.................................................................................................................................................... + mls v20.4S, v30.4S, v8.S[0] // .........................................................................................*........................................................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v22.4S, v24.4S, v9.4S // ................................................................................*................................................................................. + // gap // .................................................................................................................................................................. + ldr q17, [x5, #-112] // ...............*.................................................................................................................................................. + sub v4.4S, v3.4S, v31.4S // ...................*.............................................................................................................................................. + mls v10.4S, v14.4S, v8.S[0] // ...........................................................................................*...................................................................... 
+ trn1 v28.4S, v26.4S, v22.4S // ........................................................................................*......................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v26.4S, v26.4S, v22.4S // ......................................................................................*........................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v25.4S, v4.4S, v15.4S // ..........................*....................................................................................................................................... + add v5.4S, v3.4S, v31.4S // ....................*............................................................................................................................................. + trn2 v1.4S, v20.4S, v10.4S // .................................................................................................*................................................................ + trn1 v18.4S, v20.4S, v10.4S // ..................................................................................................*............................................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v7.4S, v4.4S, v17.4S // ...........................*...................................................................................................................................... + add v17.4S, v19.4S, v12.4S // .......................*.......................................................................................................................................... + trn2 v19.2D, v28.2D, v18.2D // ......................................................................................................*........................................................... + trn2 v14.2D, v26.2D, v1.2D // .....................................................................................................*............................................................ + // gap // .................................................................................................................................................................. + ldr q31, [x5, #-176] // ..............................*................................................................................................................................... 
+ trn1 v30.2D, v26.2D, v1.2D // .......................................................................................................*.......................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sub v10.4S, v17.4S, v5.4S // .................................*................................................................................................................................ + add v22.4S, v19.4S, v14.4S // ...............................................................................................................*.................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v25.4S, v7.4S, v8.S[0] // .....................................*............................................................................................................................ + ldr q26, [x4, #-32] // ....................................................................*............................................................................................. + // gap // .................................................................................................................................................................. 
+ trn1 v12.2D, v28.2D, v18.2D // ........................................................................................................*......................................................... + mul v11.4S, v10.4S, v29.4S // .............................................*.................................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v21.4S, v10.4S, v31.4S // ...........................................*...................................................................................................................... + add v20.4S, v17.4S, v5.4S // ..................................*............................................................................................................................... + // gap // .................................................................................................................................................................. + add v10.4S, v12.4S, v30.4S // .................................................................................................................*................................................ + // gap // .................................................................................................................................................................. + sub v4.4S, v27.4S, v25.4S // ...............................................*.................................................................................................................. 
+ sub v2.4S, v12.4S, v30.4S // ...........................................................................................................*...................................................... + // gap // .................................................................................................................................................................. + add v17.4S, v27.4S, v25.4S // .......................................................*.......................................................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v1.4S, v4.4S, v29.4S // ..................................................*............................................................................................................... + sqrdmulh v28.4S, v4.4S, v31.4S // ...................................................*.............................................................................................................. + trn2 v15.4S, v20.4S, v17.4S // ...........................................................*...................................................................................................... + // gap // .................................................................................................................................................................. 
+ sub v12.4S, v10.4S, v22.4S // .....................................................................................................................*............................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v30.4S, v2.4S, v26.S[3] // ...................................................................................................................*.............................................. + // gap // .................................................................................................................................................................. + add v7.4S, v10.4S, v22.4S // ........................................................................................................................*......................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v1.4S, v28.4S, v8.S[0] // ........................................................*......................................................................................................... + mls v11.4S, v21.4S, v8.S[0] // ......................................................*........................................................................................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v5.4S, v12.4S, v0.S[0] // ...........................................................................................................................*...................................... + srshr v31.4S, v7.4S, #23 // ............................................................................................................................*..................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v28.4S, v12.4S, v0.S[1] // ..............................................................................................................................*................................... + sub v19.4S, v19.4S, v14.4S // ..........................................................................................................*....................................................... + trn1 v27.4S, v11.4S, v1.4S // ..............................................................*................................................................................................... + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + trn1 v12.4S, v20.4S, v17.4S // ..........................................................*....................................................................................................... + mul v13.4S, v2.4S, v26.S[2] // ................................................................................................................*................................................. + trn2 v24.4S, v11.4S, v1.4S // ...............................................................*.................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sqrdmulh v17.4S, v19.4S, v6.S[1] // ..............................................................................................................*................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn2 v4.2D, v12.2D, v27.2D // .......................................................................................*.......................................................................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v18.4S, v19.4S, v6.S[0] // .............................................................................................................*.................................................... + trn2 v2.2D, v15.2D, v24.2D // ............................................................................*..................................................................................... + mls v13.4S, v30.4S, v8.S[0] // .........................................................................................................................*........................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + trn1 v14.2D, v15.2D, v24.2D // ...................................................................*.............................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ sub v10.4S, v4.4S, v2.4S // ..........................................................................................*....................................................................... + trn1 v25.2D, v12.2D, v27.2D // ..................................................................*............................................................................................... + mls v18.4S, v17.4S, v8.S[0] // .......................................................................................................................*.......................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v5.4S, v28.4S, v8.S[0] // ..............................................................................................................................................*................... + // gap // .................................................................................................................................................................. + sub v20.4S, v25.4S, v14.4S // .......................................................................*.......................................................................................... + // gap // .................................................................................................................................................................. + mul v15.4S, v10.4S, v26.S[0] // ...................................................................................................*.............................................................. 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v30.4S, v25.4S, v14.4S // .............................................................................*.................................................................................... + sqrdmulh v12.4S, v10.4S, v26.S[1] // ................................................................................................*................................................................. + sqrdmulh v23.4S, v20.4S, v0.S[3] // ..............................................................................................*................................................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v1.4S, v20.4S, v0.S[2] // .............................................................................................*.................................................................... + // gap // .................................................................................................................................................................. + add v17.4S, v4.4S, v2.4S // ...............................................................................................*.................................................................. 
+ // gap // .................................................................................................................................................................. + add v4.4S, v13.4S, v18.4S // ...............................................................................................................................*.................................. + sub v6.4S, v13.4S, v18.4S // .................................................................................................................................*................................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v7.4S, v31.4S, v8.4S // .....................................................................................................................................*............................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v15.4S, v12.4S, v8.S[0] // ............................................................................................................*..................................................... + mls v1.4S, v23.4S, v8.S[0] // ....................................................................................................*............................................................. 
+ sqrdmulh v24.4S, v6.4S, v0.S[1] // ........................................................................................................................................*......................... + sub v23.4S, v30.4S, v17.4S // ......................................................................................................................*........................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v10.4S, v6.4S, v0.S[0] // .......................................................................................................................................*.......................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + srshr v28.4S, v4.4S, #23 // ...................................................................................................................................*.............................. + sqrdmulh v18.4S, v23.4S, v16.S[3] // ............................................................................................................................................*..................... + add v2.4S, v1.4S, v15.4S // ....................................................................................................................*............................................. 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v21.4S, v23.4S, v16.S[2] // ..........................................................................................................................................*....................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + sub v12.4S, v1.4S, v15.4S // ................................................................................................................................*................................. + add v26.4S, v30.4S, v17.4S // .........................................................................................................*........................................................ + srshr v30.4S, v2.4S, #23 // ..........................................................................................................................*....................................... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mul v0.4S, v12.4S, v16.S[2] // ......................................................................................................................................*........................... + sqrdmulh v13.4S, v12.4S, v16.S[3] // ....................................................................................................................................*............................. + mls v21.4S, v18.4S, v8.S[0] // .................................................................................................................................................*................ + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + srshr v12.4S, v26.4S, #23 // ..................................................................................................................*............................................... + mls v4.4S, v28.4S, v8.4S // .........................................................................................................................................*........................ + mls v2.4S, v30.4S, v8.4S // ..................................................................................................................................*............................... 
+ // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + mls v10.4S, v24.4S, v8.S[0] // .............................................................................................................................................*.................... + mls v0.4S, v13.4S, v8.S[0] // ...........................................................................................................................................*...................... + add v18.4S, v21.4S, v5.4S // ........................................................................................................................................................*......... + // gap // .................................................................................................................................................................. + mls v26.4S, v12.4S, v8.4S // .............................................................................................................................*.................................... + // gap // .................................................................................................................................................................. 
+ sub v31.4S, v2.4S, v4.4S // ...............................................................................................................................................*.................. + // gap // .................................................................................................................................................................. + add v30.4S, v2.4S, v4.4S // ..................................................................................................................................................*............... + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + // gap // .................................................................................................................................................................. + add v3.4S, v0.4S, v10.4S // ...................................................................................................................................................*.............. + sub v12.4S, v0.4S, v10.4S // ....................................................................................................................................................*............. + sqrdmulh v13.4S, v31.4S, v16.S[1] // ......................................................................................................................................................*........... + str q30, [x1, #16] // .........................................................................................................................................................*........ 
+ mul v25.4S, v31.4S, v16.S[0] // .......................................................................................................................................................*.......... + // gap // .................................................................................................................................................................. + add v10.4S, v26.4S, v7.4S // ................................................................................................................................................*................. + // gap // .................................................................................................................................................................. + sub v1.4S, v26.4S, v7.4S // ............................................................................................................................................................*..... + str q18, [x1, #32] // ...............................................................................................................................................................*.. + // gap // .................................................................................................................................................................. + sqrdmulh v27.4S, v12.4S, v16.S[1] // ..........................................................................................................................................................*....... + mul v11.4S, v12.4S, v16.S[0] // .............................................................................................................................................................*.... + str q3, [x1, #48] // ...........................................................................................................................................................*...... 
+ str q10, [x1], #(16*4) // .....................................................................................................................................................*............ + mls v25.4S, v13.4S, v8.S[0] // ..............................................................................................................................................................*... + sub v23.4S, v21.4S, v5.4S // .................................................................................................................................................................* + add x1, x1, #64 // ................................................................................................................................................................*. + + // ------------------------------------------------------------------------- new position --------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q15, [x1, #0] // .......*.......................................................................................................................................................... + // ldr q20, [x1, #16] // ......*........................................................................................................................................................... + // ldr q19, [x1, #32] // ....*............................................................................................................................................................. + // ldr q13, [x1, #48] // .....*............................................................................................................................................................ 
+ // trn2 v25.4S, v15.4S, v20.4S // ....................*............................................................................................................................................. + // trn1 v17.4S, v15.4S, v20.4S // ................................*................................................................................................................................. + // ldr q9, [x5, #64] // .................................*................................................................................................................................ + // trn2 v11.4S, v19.4S, v13.4S // ................*................................................................................................................................................. + // trn1 v31.4S, v19.4S, v13.4S // .................*................................................................................................................................................ + // ldr q21, [x5, #128] // ...........................*...................................................................................................................................... + // ldr q3, [x5, #48] // ..................................*............................................................................................................................... + // ldr q13, [x5, #160] // .....................*............................................................................................................................................ + // trn2 v26.2D, v25.2D, v11.2D // .................................................*................................................................................................................ + // trn2 v20.2D, v17.2D, v31.2D // .........................................................*........................................................................................................ 
+ // ldr q24, [x5, #32] // .........................................*........................................................................................................................ + // ldr q23, [x5, #80] // ............................................................*..................................................................................................... + // trn1 v27.2D, v25.2D, v11.2D // .........................*........................................................................................................................................ + // trn1 v11.2D, v17.2D, v31.2D // ........................................*......................................................................................................................... + // ldr q16, [x5, #144] // ...........*...................................................................................................................................................... + // sub v18.4S, v20.4S, v26.4S // .............................................................*.................................................................................................... + // add v25.4S, v20.4S, v26.4S // ..................................................................*............................................................................................... + // sub v14.4S, v11.4S, v27.4S // .............................................*.................................................................................................................... + // ldr q26, [x2, #16] // *................................................................................................................................................................. + // add v20.4S, v11.4S, v27.4S // ......................................................................*........................................................................................... 
+ // ldr q0, [x2, #0] // .*................................................................................................................................................................ + // ldr q17, [x2, #32] // ..*............................................................................................................................................................... + // mul v27.4S, v18.4S, v9.4S // .................................................................*................................................................................................ + // sqrdmulh v7.4S, v18.4S, v23.4S // .....................................................................*............................................................................................ + // sqrdmulh v11.4S, v14.4S, v3.4S // ................................................*................................................................................................................. + // mul v10.4S, v14.4S, v24.4S // ...................................................*.............................................................................................................. + // ldr q1, [x5, #16] // .........................................................................*........................................................................................ + // ldr q24, [x2, #48] // ...*.............................................................................................................................................................. + // ldr q23, [x5, #176] // .......................*.......................................................................................................................................... + // sub v18.4S, v20.4S, v25.4S // ...........................................................................*...................................................................................... 
+ // add v3.4S, v20.4S, v25.4S // ..................................................................................*............................................................................... + // ldr q20, [x5], #(12*16) // ...............................*.................................................................................................................................. + // trn1 v15.4S, v0.4S, v26.4S // ..........*....................................................................................................................................................... + // mls v27.4S, v7.4S, v8.S[0] // .............................................................................*.................................................................................... + // trn2 v0.4S, v0.4S, v26.4S // ........*......................................................................................................................................................... + // mls v10.4S, v11.4S, v8.S[0] // ........................................................*......................................................................................................... + // ldr q14, [x5, #-80] // ............................*..................................................................................................................................... + // ldr q5, [x5, #-96] // ...................*.............................................................................................................................................. + // trn2 v7.4S, v17.4S, v24.4S // .............*.................................................................................................................................................... + // sqrdmulh v26.4S, v18.4S, v1.4S // .................................................................................*................................................................................ 
+ // trn1 v25.4S, v17.4S, v24.4S // ..............*................................................................................................................................................... + // mul v11.4S, v18.4S, v20.4S // ................................................................................*................................................................................. + // trn2 v9.2D, v0.2D, v7.2D // ..................*............................................................................................................................................... + // sub v24.4S, v10.4S, v27.4S // ....................................................................................*............................................................................. + // trn1 v31.2D, v0.2D, v7.2D // ......................*........................................................................................................................................... + // trn2 v18.2D, v15.2D, v25.2D // ........................*......................................................................................................................................... + // mul v7.4S, v24.4S, v20.4S // .......................................................................................*.......................................................................... + // sqrdmulh v24.4S, v24.4S, v1.4S // ........................................................................................*......................................................................... + // trn1 v1.2D, v15.2D, v25.2D // ..........................*....................................................................................................................................... + // sub v25.4S, v18.4S, v9.4S // .............................*.................................................................................................................................... 
+ // mls v11.4S, v26.4S, v8.S[0] // ..............................................................................................*................................................................... + // add v20.4S, v10.4S, v27.4S // ......................................................................................*........................................................................... + // mls v7.4S, v24.4S, v8.S[0] // .............................................................................................*.................................................................... + // sub v0.4S, v1.4S, v31.4S // ..............................*................................................................................................................................... + // trn1 v15.4S, v3.4S, v20.4S // ....................................................................................................*............................................................. + // trn2 v24.4S, v3.4S, v20.4S // .........................................................................................*........................................................................ + // sqrdmulh v3.4S, v25.4S, v23.4S // ....................................*............................................................................................................................. + // mul v23.4S, v0.4S, v21.4S // ......................................*........................................................................................................................... + // trn1 v20.4S, v11.4S, v7.4S // ...................................................................................................*.............................................................. + // trn2 v7.4S, v11.4S, v7.4S // ......................................................................................................*........................................................... 
+ // mul v10.4S, v25.4S, v13.4S // ...................................*.............................................................................................................................. + // sqrdmulh v26.4S, v0.4S, v16.4S // .....................................*............................................................................................................................ + // trn1 v11.2D, v15.2D, v20.2D // ..............................................................................................................*................................................... + // trn1 v13.2D, v24.2D, v7.2D // ............................................................................................................*..................................................... + // ldr q17, [x4, #32] // ..............................................................................*................................................................................... + // ldr q25, [x4, #48] // ...............*.................................................................................................................................................. + // add v9.4S, v18.4S, v9.4S // ..........................................*....................................................................................................................... + // sub v27.4S, v11.4S, v13.4S // .................................................................................................................*................................................ + // ldr q0, [x4, #16] // .........*........................................................................................................................................................ + // mls v23.4S, v26.4S, v8.S[0] // ............................................*..................................................................................................................... 
+ // add v21.4S, v1.4S, v31.4S // .......................................*.......................................................................................................................... + // mls v10.4S, v3.4S, v8.S[0] // ...........................................*...................................................................................................................... + // trn2 v2.2D, v24.2D, v7.2D // ..........................................................................................................*....................................................... + // add v18.4S, v11.4S, v13.4S // ...................................................................................................................*.............................................. + // add v1.4S, v21.4S, v9.4S // ..............................................*................................................................................................................... + // sub v7.4S, v21.4S, v9.4S // ...............................................*.................................................................................................................. + // add v11.4S, v23.4S, v10.4S // ...........................................................*...................................................................................................... + // sub v24.4S, v23.4S, v10.4S // ..................................................*............................................................................................................... + // sqrdmulh v13.4S, v7.4S, v14.4S // ....................................................*............................................................................................................. + // mul v23.4S, v7.4S, v5.4S // .....................................................*............................................................................................................ 
+ // sqrdmulh v7.4S, v24.4S, v14.4S // .......................................................*.......................................................................................................... + // mul v24.4S, v24.4S, v5.4S // ......................................................*........................................................................................................... + // trn2 v3.4S, v1.4S, v11.4S // ................................................................*................................................................................................. + // trn2 v9.2D, v15.2D, v20.2D // ........................................................................................................*......................................................... + // trn1 v20.4S, v1.4S, v11.4S // ...............................................................*.................................................................................................. + // mls v23.4S, v13.4S, v8.S[0] // ..........................................................*....................................................................................................... + // sub v11.4S, v9.4S, v2.4S // .............................................................................................................*.................................................... + // mls v24.4S, v7.4S, v8.S[0] // ..............................................................*................................................................................................... + // ldr q16, [x4], #64 // ............*..................................................................................................................................................... + // mul v10.4S, v27.4S, v0.S[2] // ......................................................................................................................*........................................... 
+ // sqrdmulh v7.4S, v27.4S, v0.S[3] // .....................................................................................................................*............................................ + // add v9.4S, v9.4S, v2.4S // .......................................................................................................................*.......................................... + // sqrdmulh v27.4S, v11.4S, v17.S[1] // ....................................................................................................................*............................................. + // trn2 v1.4S, v23.4S, v24.4S // ...................................................................*.............................................................................................. + // trn1 v21.4S, v23.4S, v24.4S // ....................................................................*............................................................................................. + // mul v24.4S, v11.4S, v17.S[0] // ..................................................................................................................*............................................... + // mls v10.4S, v7.4S, v8.S[0] // ............................................................................................................................*..................................... + // trn2 v7.2D, v3.2D, v1.2D // ........................................................................*......................................................................................... + // trn2 v13.2D, v20.2D, v21.2D // .......................................................................*.......................................................................................... + // trn1 v26.2D, v3.2D, v1.2D // ..........................................................................*....................................................................................... 
+ // trn1 v19.2D, v20.2D, v21.2D // ...............................................................................*.................................................................................. + // add v3.4S, v18.4S, v9.4S // .....................................................................................................................................*............................ + // sub v23.4S, v13.4S, v7.4S // ..................................................................................................*............................................................... + // sub v21.4S, v19.4S, v26.4S // .....................................................................................*............................................................................ + // mls v24.4S, v27.4S, v8.S[0] // ...........................................................................................................................*...................................... + // mul v11.4S, v23.4S, v25.S[0] // .........................................................................................................*........................................................ + // sqrdmulh v27.4S, v23.4S, v25.S[1] // .......................................................................................................*.......................................................... + // add v7.4S, v13.4S, v7.4S // ............................................................................*..................................................................................... + // mul v1.4S, v21.4S, v17.S[2] // .....................................................................................................*............................................................ + // add v20.4S, v19.4S, v26.4S // ...................................................................................*.............................................................................. 
+ // srshr v13.4S, v3.4S, #23 // ..........................................................................................................................................*....................... + // sqrdmulh v21.4S, v21.4S, v17.S[3] // ...........................................................................................*...................................................................... + // add v15.4S, v10.4S, v24.4S // ..................................................................................................................................*............................... + // sub v25.4S, v20.4S, v7.4S // ..........................................................................................*....................................................................... + // sub v23.4S, v18.4S, v9.4S // ..............................................................................................................................*................................... + // mls v11.4S, v27.4S, v8.S[0] // ...............................................................................................................*.................................................. + // add v20.4S, v20.4S, v7.4S // ............................................................................................*..................................................................... + // mls v1.4S, v21.4S, v8.S[0] // ...........................................................................................................*...................................................... + // srshr v27.4S, v15.4S, #23 // ......................................................................................................................................*........................... + // mul v26.4S, v25.4S, v0.S[0] // ...............................................................................................*.................................................................. 
+ // srshr v7.4S, v20.4S, #23 // ................................................................................................*................................................................. + // mls v3.4S, v13.4S, v8.4S // ................................................................................................................................................*................. + // sqrdmulh v25.4S, v25.4S, v0.S[1] // .................................................................................................*................................................................ + // add v9.4S, v1.4S, v11.4S // ........................................................................................................................*......................................... + // sub v24.4S, v10.4S, v24.4S // ....................................................................................................................................*............................. + // sub v1.4S, v1.4S, v11.4S // .........................................................................................................................*........................................ + // mls v15.4S, v27.4S, v8.4S // ............................................................................................................................................*..................... + // srshr v27.4S, v9.4S, #23 // ................................................................................................................................*................................. + // sqrdmulh v11.4S, v24.4S, v16.S[3] // ........................................................................................................................................*......................... + // mls v20.4S, v7.4S, v8.4S // ..........................................................................................................................*....................................... 
+ // mul v13.4S, v24.4S, v16.S[2] // .......................................................................................................................................*.......................... + // mul v24.4S, v1.4S, v0.S[0] // ...............................................................................................................................*.................................. + // sqrdmulh v7.4S, v1.4S, v0.S[1] // .............................................................................................................................*.................................... + // mls v9.4S, v27.4S, v8.4S // ...........................................................................................................................................*...................... + // mul v10.4S, v23.4S, v16.S[2] // ...................................................................................................................................*.............................. + // mls v13.4S, v11.4S, v8.S[0] // ..............................................................................................................................................*................... + // sqrdmulh v23.4S, v23.4S, v16.S[3] // .................................................................................................................................*................................ + // mls v24.4S, v7.4S, v8.S[0] // .............................................................................................................................................*.................... + // mls v26.4S, v25.4S, v8.S[0] // ................................................................................................................*................................................. + // sub v11.4S, v15.4S, v9.4S // .................................................................................................................................................*................ 
+ // add v27.4S, v3.4S, v20.4S // ........................................................................................................................................................*......... + // mls v10.4S, v23.4S, v8.S[0] // .........................................................................................................................................*........................ + // add v9.4S, v15.4S, v9.4S // ..................................................................................................................................................*............... + // add v23.4S, v13.4S, v24.4S // ...................................................................................................................................................*.............. + // sub v24.4S, v13.4S, v24.4S // ....................................................................................................................................................*............. + // str q27, [x1], #(16*4) // ..............................................................................................................................................................*... + // sqrdmulh v7.4S, v11.4S, v16.S[1] // .....................................................................................................................................................*............ + // mul v25.4S, v11.4S, v16.S[0] // .......................................................................................................................................................*.......... + // add v22.4S, v10.4S, v26.4S // ...............................................................................................................................................*.................. + // str q9, [x1, #-48] // ......................................................................................................................................................*........... 
+ // sqrdmulh v27.4S, v24.4S, v16.S[1] // ...........................................................................................................................................................*...... + // str q23, [x1, #-16] // .............................................................................................................................................................*.... + // sub v1.4S, v3.4S, v20.4S // .........................................................................................................................................................*........ + // mul v11.4S, v24.4S, v16.S[0] // ............................................................................................................................................................*..... + // mls v25.4S, v7.4S, v8.S[0] // ...............................................................................................................................................................*.. + // str q22, [x1, #-32] // ..........................................................................................................................................................*....... + // add x1, x1, #64 // .................................................................................................................................................................* + // sub v23.4S, v10.4S, v26.4S // ................................................................................................................................................................*. 
+ + sub count, count, #1 +layer45678_start: + // Instructions: 174 + // Expected cycles: 72 + // Expected IPC: 2.42 + // + // Wall time: 2084.65s + // User time: 2084.65s + // + // ----------------------------------------------------------------------------- original position -----------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------------- + sqrdmulh v9.4S, v1.4S, v16.S[1] // ...................................................................................................................................................*.......................... + ldr q15, [x1, #0] // e............................................................................................................................................................................. + mul v1.4S, v1.4S, v16.S[0] // ..................................................................................................................................................*........................... + ldr q20, [x1, #16] // .e............................................................................................................................................................................ + sqrdmulh v24.4S, v23.4S, v16.S[1] // .............................................................................................................................................................*................ + // gap // .............................................................................................................................................................................. + ldr q19, [x1, #32] // ..e........................................................................................................................................................................... 
+ ldr q13, [x1, #48] // ...e.......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + str q25, [x2, #16] // .........................................................................................................................................................................*.... + mls v11.4S, v27.4S, v8.S[0] // ...................................................................................................................................................................*.......... + mul v7.4S, v23.4S, v16.S[0] // ............................................................................................................................................................*................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v1.4S, v9.4S, v8.S[0] // ....................................................................................................................................................*......................... + trn2 v25.4S, v15.4S, v20.4S // .....e........................................................................................................................................................................ 
+ trn1 v17.4S, v15.4S, v20.4S // ....e......................................................................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q9, [x5, #64] // ............................e................................................................................................................................................. + str q11, [x2, #48] // ...........................................................................................................................................................................*.. + trn2 v11.4S, v19.4S, v13.4S // .......e...................................................................................................................................................................... + trn1 v31.4S, v19.4S, v13.4S // ......e....................................................................................................................................................................... + ldr q21, [x5, #128] // ....................................................e......................................................................................................................... + mls v7.4S, v24.4S, v8.S[0] // ..............................................................................................................................................................*............... + // gap // .............................................................................................................................................................................. + ldr q3, [x5, #48] // ...........................e.................................................................................................................................................. 
+ ldr q13, [x5, #160] // ......................................................e....................................................................................................................... + trn2 v26.2D, v25.2D, v11.2D // .........e.................................................................................................................................................................... + trn2 v20.2D, v17.2D, v31.2D // ........e..................................................................................................................................................................... + ldr q24, [x5, #32] // ..........................e................................................................................................................................................... + ldr q23, [x5, #80] // .............................e................................................................................................................................................ + trn1 v27.2D, v25.2D, v11.2D // ...........e.................................................................................................................................................................. + trn1 v11.2D, v17.2D, v31.2D // ..........e................................................................................................................................................................... + str q1, [x2], #(16*4) // ........................................................................................................................................................................*..... + ldr q16, [x5, #144] // .....................................................e........................................................................................................................ 
+ str q7, [x2, #-32] // ..........................................................................................................................................................................*... + add x2, x2, #64 // .............................................................................................................................................................................* + sub v18.4S, v20.4S, v26.4S // ...................................e.......................................................................................................................................... + add v25.4S, v20.4S, v26.4S // ....................................e......................................................................................................................................... + sub v14.4S, v11.4S, v27.4S // ..............................e............................................................................................................................................... + ldr q26, [x2, #16] // .............e................................................................................................................................................................ + add v20.4S, v11.4S, v27.4S // ...............................e.............................................................................................................................................. + ldr q0, [x2, #0] // ............e................................................................................................................................................................. + ldr q17, [x2, #32] // ..............e............................................................................................................................................................... 
+ mul v27.4S, v18.4S, v9.4S // .....................................e........................................................................................................................................ + sqrdmulh v7.4S, v18.4S, v23.4S // ......................................e....................................................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v11.4S, v14.4S, v3.4S // .................................e............................................................................................................................................ + mul v10.4S, v14.4S, v24.4S // ................................e............................................................................................................................................. + ldr q1, [x5, #16] // .........................e.................................................................................................................................................... + ldr q24, [x2, #48] // ...............e.............................................................................................................................................................. + ldr q23, [x5, #176] // .......................................................e...................................................................................................................... + sub v18.4S, v20.4S, v25.4S // ........................................e..................................................................................................................................... 
+ add v3.4S, v20.4S, v25.4S // .........................................e.................................................................................................................................... + ldr q20, [x5], #(12*16) // ........................e..................................................................................................................................................... + trn1 v15.4S, v0.4S, v26.4S // ................e............................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v27.4S, v7.4S, v8.S[0] // .......................................e...................................................................................................................................... + trn2 v0.4S, v0.4S, v26.4S // .................e............................................................................................................................................................ + mls v10.4S, v11.4S, v8.S[0] // ..................................e........................................................................................................................................... + // gap // .............................................................................................................................................................................. + ldr q14, [x5, #-80] // ...................................................e.......................................................................................................................... 
+ ldr q5, [x5, #-96] // ..................................................e........................................................................................................................... + trn2 v7.4S, v17.4S, v24.4S // ...................e.......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v26.4S, v18.4S, v1.4S // ...........................................e.................................................................................................................................. + trn1 v25.4S, v17.4S, v24.4S // ..................e........................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v11.4S, v18.4S, v20.4S // ..........................................e................................................................................................................................... + trn2 v9.2D, v0.2D, v7.2D // .....................e........................................................................................................................................................ + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sub v24.4S, v10.4S, v27.4S // .............................................e................................................................................................................................ + trn1 v31.2D, v0.2D, v7.2D // .......................e...................................................................................................................................................... + trn2 v18.2D, v15.2D, v25.2D // ....................e......................................................................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v7.4S, v24.4S, v20.4S // ...............................................e.............................................................................................................................. + sqrdmulh v24.4S, v24.4S, v1.4S // ................................................e............................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + trn1 v1.2D, v15.2D, v25.2D // ......................e....................................................................................................................................................... + // gap // .............................................................................................................................................................................. + sub v25.4S, v18.4S, v9.4S // .............................................................e................................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v11.4S, v26.4S, v8.S[0] // ............................................e................................................................................................................................. + add v20.4S, v10.4S, v27.4S // ..............................................e............................................................................................................................... + mls v7.4S, v24.4S, v8.S[0] // .................................................e............................................................................................................................ + sub v0.4S, v1.4S, v31.4S // ........................................................e..................................................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v15.4S, v3.4S, v20.4S // ............................................................................e................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v24.4S, v3.4S, v20.4S // .............................................................................e................................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v3.4S, v25.4S, v23.4S // ................................................................e............................................................................................................. + mul v23.4S, v0.4S, v21.4S // ..........................................................e................................................................................................................... 
+ trn1 v20.4S, v11.4S, v7.4S // ..............................................................................e............................................................................................... + trn2 v7.4S, v11.4S, v7.4S // ...............................................................................e.............................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v10.4S, v25.4S, v13.4S // ...............................................................e.............................................................................................................. + sqrdmulh v26.4S, v0.4S, v16.4S // ...........................................................e.................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v11.2D, v15.2D, v20.2D // ..................................................................................e........................................................................................... + trn1 v13.2D, v24.2D, v7.2D // ...................................................................................e.......................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q17, [x4, #32] // ..............................................................................................e............................................................................... + ldr q25, [x4, #48] // ...............................................................................................e.............................................................................. + add v9.4S, v18.4S, v9.4S // ..............................................................e............................................................................................................... + sub v27.4S, v11.4S, v13.4S // ................................................................................................e............................................................................. + // gap // .............................................................................................................................................................................. + ldr q0, [x4, #16] // .............................................................................................e................................................................................ + mls v23.4S, v26.4S, v8.S[0] // ............................................................e................................................................................................................. 
+ add v21.4S, v1.4S, v31.4S // .........................................................e.................................................................................................................... + // gap // .............................................................................................................................................................................. + mls v10.4S, v3.4S, v8.S[0] // .................................................................e............................................................................................................ + // gap // .............................................................................................................................................................................. + trn2 v2.2D, v24.2D, v7.2D // .................................................................................e............................................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v18.4S, v11.4S, v13.4S // .................................................................................................e............................................................................ + add v1.4S, v21.4S, v9.4S // ...................................................................e.......................................................................................................... + sub v7.4S, v21.4S, v9.4S // ..................................................................e........................................................................................................... 
+ // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v11.4S, v23.4S, v10.4S // ........................................................................e..................................................................................................... + sub v24.4S, v23.4S, v10.4S // .......................................................................e...................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v13.4S, v7.4S, v14.4S // .....................................................................e........................................................................................................ + mul v23.4S, v7.4S, v5.4S // ....................................................................e......................................................................................................... + // gap // .............................................................................................................................................................................. 
+ sqrdmulh v7.4S, v24.4S, v14.4S // ..........................................................................e................................................................................................... + mul v24.4S, v24.4S, v5.4S // .........................................................................e.................................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn2 v3.4S, v1.4S, v11.4S // .....................................................................................e........................................................................................ + trn2 v9.2D, v15.2D, v20.2D // ................................................................................e............................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v20.4S, v1.4S, v11.4S // ....................................................................................e......................................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mls v23.4S, v13.4S, v8.S[0] // ......................................................................e....................................................................................................... + sub v11.4S, v9.4S, v2.4S // .....................................................................................................e........................................................................ + mls v24.4S, v7.4S, v8.S[0] // ...........................................................................e.................................................................................................. + // gap // .............................................................................................................................................................................. + ldr q16, [x4], #64 // ............................................................................................e................................................................................. + mul v10.4S, v27.4S, v0.S[2] // ..................................................................................................e........................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sqrdmulh v7.4S, v27.4S, v0.S[3] // ...................................................................................................e.......................................................................... 
+ add v9.4S, v9.4S, v2.4S // ......................................................................................................e....................................................................... + // gap // .............................................................................................................................................................................. + sqrdmulh v27.4S, v11.4S, v17.S[1] // ........................................................................................................e..................................................................... + // gap // .............................................................................................................................................................................. + trn2 v1.4S, v23.4S, v24.4S // .......................................................................................e...................................................................................... + trn1 v21.4S, v23.4S, v24.4S // ......................................................................................e....................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v24.4S, v11.4S, v17.S[0] // .......................................................................................................e...................................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mls v10.4S, v7.4S, v8.S[0] // ....................................................................................................e......................................................................... + trn2 v7.2D, v3.2D, v1.2D // .........................................................................................e.................................................................................... + trn2 v13.2D, v20.2D, v21.2D // ........................................................................................e..................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + trn1 v26.2D, v3.2D, v1.2D // ...........................................................................................e.................................................................................. + trn1 v19.2D, v20.2D, v21.2D // ..........................................................................................e................................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ add v3.4S, v18.4S, v9.4S // .....................................................................................................................e........................................................ + sub v23.4S, v13.4S, v7.4S // ...............................................................................................................e.............................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v21.4S, v19.4S, v26.4S // ..........................................................................................................e................................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v24.4S, v27.4S, v8.S[0] // .........................................................................................................e.................................................................... + mul v11.4S, v23.4S, v25.S[0] // .................................................................................................................e............................................................ + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sqrdmulh v27.4S, v23.4S, v25.S[1] // ..................................................................................................................e........................................................... + add v7.4S, v13.4S, v7.4S // ................................................................................................................e............................................................. + mul v1.4S, v21.4S, v17.S[2] // ............................................................................................................e................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v20.4S, v19.4S, v26.4S // ...........................................................................................................e.................................................................. + srshr v13.4S, v3.4S, #23 // ........................................................................................................................................e..................................... 
+ sqrdmulh v21.4S, v21.4S, v17.S[3] // .............................................................................................................e................................................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v15.4S, v10.4S, v24.4S // ..........................................................................................................................e................................................... + sub v25.4S, v20.4S, v7.4S // ..............................................................................................................................e............................................... + sub v23.4S, v18.4S, v9.4S // ....................................................................................................................e......................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v11.4S, v27.4S, v8.S[0] // ...................................................................................................................e.......................................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + add v20.4S, v20.4S, v7.4S // ...............................................................................................................................e.............................................. + mls v1.4S, v21.4S, v8.S[0] // ..............................................................................................................e............................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + srshr v27.4S, v15.4S, #23 // ..........................................................................................................................................e................................... + // gap // .............................................................................................................................................................................. + mul v26.4S, v25.4S, v0.S[0] // ................................................................................................................................e............................................. + srshr v7.4S, v20.4S, #23 // ............................................................................................................................................e................................. + // gap // .............................................................................................................................................................................. 
+ mls v3.4S, v13.4S, v8.4S // .........................................................................................................................................e.................................... + sqrdmulh v25.4S, v25.4S, v0.S[1] // .................................................................................................................................e............................................ + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v9.4S, v1.4S, v11.4S // ....................................................................................................................................e......................................... + sub v24.4S, v10.4S, v24.4S // .........................................................................................................................e.................................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v1.4S, v1.4S, v11.4S // ...................................................................................................................................e.......................................... + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + mls v15.4S, v27.4S, v8.4S // ...........................................................................................................................................e.................................. + // gap // .............................................................................................................................................................................. + srshr v27.4S, v9.4S, #23 // ..............................................................................................................................................e............................... + sqrdmulh v11.4S, v24.4S, v16.S[3] // ............................................................................................................................e................................................. + // gap // .............................................................................................................................................................................. + mls v20.4S, v7.4S, v8.4S // .............................................................................................................................................e................................ + mul v13.4S, v24.4S, v16.S[2] // ...........................................................................................................................e.................................................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. 
+ mul v24.4S, v1.4S, v0.S[0] // .....................................................................................................................................e........................................ + sqrdmulh v7.4S, v1.4S, v0.S[1] // ......................................................................................................................................e....................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v9.4S, v27.4S, v8.4S // ...............................................................................................................................................e.............................. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mul v10.4S, v23.4S, v16.S[2] // ......................................................................................................................e....................................................... + mls v13.4S, v11.4S, v8.S[0] // .............................................................................................................................e................................................ + // gap // .............................................................................................................................................................................. 
+ // gap // .............................................................................................................................................................................. + sqrdmulh v23.4S, v23.4S, v16.S[3] // .......................................................................................................................e...................................................... + mls v24.4S, v7.4S, v8.S[0] // .......................................................................................................................................e...................................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + mls v26.4S, v25.4S, v8.S[0] // ..................................................................................................................................e........................................... + sub v11.4S, v15.4S, v9.4S // .....................................................................................................................................................e........................ + // gap // .............................................................................................................................................................................. + add v27.4S, v3.4S, v20.4S // .................................................................................................................................................e............................ + // gap // .............................................................................................................................................................................. 
+ mls v10.4S, v23.4S, v8.S[0] // ........................................................................................................................e..................................................... + add v9.4S, v15.4S, v9.4S // ......................................................................................................................................................e....................... + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + add v23.4S, v13.4S, v24.4S // ................................................................................................................................................................e............. + // gap // .............................................................................................................................................................................. + // gap // .............................................................................................................................................................................. + sub v24.4S, v13.4S, v24.4S // ...............................................................................................................................................................e.............. + str q27, [x1], #(16*4) // ....................................................................................................................................................................e......... + sqrdmulh v7.4S, v11.4S, v16.S[1] // ........................................................................................................................................................e..................... 
+ // gap // .............................................................................................................................................................................. + mul v25.4S, v11.4S, v16.S[0] // .......................................................................................................................................................e...................... + add v22.4S, v10.4S, v26.4S // ...........................................................................................................................................................e.................. + str q9, [x1, #-48] // .....................................................................................................................................................................e........ + // gap // .............................................................................................................................................................................. + sqrdmulh v27.4S, v24.4S, v16.S[1] // ..................................................................................................................................................................e........... + str q23, [x1, #-16] // .......................................................................................................................................................................e...... + sub v1.4S, v3.4S, v20.4S // ................................................................................................................................................e............................. + mul v11.4S, v24.4S, v16.S[0] // .................................................................................................................................................................e............ 
+ // gap // .............................................................................................................................................................................. + mls v25.4S, v7.4S, v8.S[0] // .........................................................................................................................................................e.................... + str q22, [x1, #-32] // ......................................................................................................................................................................e....... + add x1, x1, #64 // ............................................................................................................................................................................e. + sub v23.4S, v10.4S, v26.4S // ..........................................................................................................................................................e................... + + // ---------------------------------------------------------------------------------------------- new position -----------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|--- + // ldr q9, [x1, #0] // e............................................................................................................................................................................'~............................. + // ldr q10, [x1, #16] // ..e..........................................................................................................................................................................'..~........................... 
+ // ldr q11, [x1, #32] // ....e........................................................................................................................................................................'....~......................... + // ldr q12, [x1, #48] // .....e.......................................................................................................................................................................'.....~........................ + // trn1 v25.4s, v9.4s, v10.4s // ...........e.................................................................................................................................................................'...........~.................. + // trn2 v26.4s, v9.4s, v10.4s // ..........e..................................................................................................................................................................'..........~................... + // trn1 v27.4s, v11.4s, v12.4s // ...............e.............................................................................................................................................................'...............~.............. + // trn2 v28.4s, v11.4s, v12.4s // ..............e..............................................................................................................................................................'..............~............... + // trn2 v11.2d, v25.2d, v27.2d // .....................e.......................................................................................................................................................'.....................~........ + // trn2 v12.2d, v26.2d, v28.2d // ....................e........................................................................................................................................................'....................~......... 
+ // trn1 v9.2d, v25.2d, v27.2d // .........................e...................................................................................................................................................'.........................~.... + // trn1 v10.2d, v26.2d, v28.2d // ........................e....................................................................................................................................................'........................~..... + // ldr q13, [x2, #0] // ...................................e.........................................................................................................................................'.............................. + // ldr q14, [x2, #16] // .................................e...........................................................................................................................................'.............................. + // ldr q15, [x2, #32] // ....................................e........................................................................................................................................'.............................. + // ldr q16, [x2, #48] // ..........................................e..................................................................................................................................'.............................. + // trn1 v25.4s, v13.4s, v14.4s // ...............................................e.............................................................................................................................'.............................. + // trn2 v26.4s, v13.4s, v14.4s // .................................................e...........................................................................................................................'.............................. 
+ // trn1 v27.4s, v15.4s, v16.4s // .......................................................e.....................................................................................................................'.............................. + // trn2 v28.4s, v15.4s, v16.4s // .....................................................e.......................................................................................................................'.............................. + // trn2 v15.2d, v25.2d, v27.2d // ............................................................e................................................................................................................'.............................. + // trn2 v16.2d, v26.2d, v28.2d // .........................................................e...................................................................................................................'.............................. + // trn1 v13.2d, v25.2d, v27.2d // ...............................................................e.............................................................................................................'.............................. + // trn1 v14.2d, v26.2d, v28.2d // ...........................................................e.................................................................................................................'.............................. + // ldr q0, [x5], #(12*16) // ..............................................e..............................................................................................................................'.............................. + // ldr q4, [x5, #(-12*16 + 1*16)] // .........................................e...................................................................................................................................'.............................. 
+ // ldr q1, [x5, #(-12*16 + 2*16)] // ......................e......................................................................................................................................................'......................~....... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..................e..........................................................................................................................................................'..................~........... + // ldr q2, [x5, #(-12*16 + 4*16)] // ............e................................................................................................................................................................'............~................. + // ldr q6, [x5, #(-12*16 + 5*16)] // .......................e.....................................................................................................................................................'.......................~...... + // sub v24.4s, v9.4s, v10.4s // ................................e............................................................................................................................................'.............................. + // add v9.4s, v9.4s, v10.4s // ..................................e..........................................................................................................................................'.............................. + // mul v10.4s, v24.4s, v1.4s // ........................................e....................................................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................e.....................................................................................................................................'.............................. 
+ // mls v10.4s, v24.4s, v8.s[0] // ..................................................e..........................................................................................................................'.............................. + // sub v24.4s, v11.4s, v12.4s // ..............................e..............................................................................................................................................'.............................. + // add v11.4s, v11.4s, v12.4s // ...............................e.............................................................................................................................................'.............................. + // mul v12.4s, v24.4s, v2.4s // .....................................e.......................................................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ......................................e......................................................................................................................................'.............................. + // mls v12.4s, v24.4s, v8.s[0] // ................................................e............................................................................................................................'.............................. + // sub v24.4s, v9.4s, v11.4s // ............................................e................................................................................................................................'.............................. + // add v9.4s, v9.4s, v11.4s // .............................................e...............................................................................................................................'.............................. 
+ // mul v11.4s, v24.4s, v0.4s // ........................................................e....................................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................e......................................................................................................................'.............................. + // mls v11.4s, v24.4s, v8.s[0] // .................................................................e...........................................................................................................'.............................. + // sub v24.4s, v10.4s, v12.4s // ..........................................................e..................................................................................................................'.............................. + // add v10.4s, v10.4s, v12.4s // ..................................................................e..........................................................................................................'.............................. + // mul v12.4s, v24.4s, v0.4s // .............................................................e...............................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // ..............................................................e..............................................................................................................'.............................. + // mls v12.4s, v24.4s, v8.s[0] // ...................................................................e.........................................................................................................'.............................. 
+ // ldr q0, [x5, #(-12*16 + 6*16)] // ....................................................e........................................................................................................................'.............................. + // ldr q4, [x5, #(-12*16 + 7*16)] // ...................................................e.........................................................................................................................'.............................. + // ldr q1, [x5, #(-12*16 + 8*16)] // ................e............................................................................................................................................................'................~............. + // ldr q5, [x5, #(-12*16 + 9*16)] // ...........................e.................................................................................................................................................'...........................~.. + // ldr q2, [x5, #(-12*16 + 10*16)] // ...................e.........................................................................................................................................................'...................~.......... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...........................................e.................................................................................................................................'.............................. + // sub v24.4s, v13.4s, v14.4s // ....................................................................e........................................................................................................'.............................. + // add v13.4s, v13.4s, v14.4s // .....................................................................................e.......................................................................................'.............................. 
+ // mul v14.4s, v24.4s, v1.4s // ........................................................................e....................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v5.4s // ............................................................................e................................................................................................'.............................. + // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................e........................................................................................'.............................. + // sub v24.4s, v15.4s, v16.4s // ................................................................e............................................................................................................'.............................. + // add v15.4s, v15.4s, v16.4s // .................................................................................e...........................................................................................'.............................. + // mul v16.4s, v24.4s, v2.4s // ...........................................................................e.................................................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v6.4s // .......................................................................e.....................................................................................................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................e......................................................................................'.............................. 
+ // sub v24.4s, v13.4s, v15.4s // ..........................................................................................e..................................................................................'.............................. + // add v13.4s, v13.4s, v15.4s // .........................................................................................e...................................................................................'.............................. + // mul v15.4s, v24.4s, v0.4s // ..............................................................................................e..............................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................................................e...............................................................................'.............................. + // mls v15.4s, v24.4s, v8.s[0] // ....................................................................................................e........................................................................'.............................. + // sub v24.4s, v14.4s, v16.4s // ............................................................................................e................................................................................'.............................. + // add v14.4s, v14.4s, v16.4s // ...........................................................................................e.................................................................................'.............................. + // mul v16.4s, v24.4s, v0.4s // ................................................................................................e............................................................................'.............................. 
+ // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................................................................e.............................................................................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................e......................................................................'.............................. + // trn1 v25.4s, v9.4s, v10.4s // .....................................................................e.......................................................................................................'.............................. + // trn2 v26.4s, v9.4s, v10.4s // ......................................................................e......................................................................................................'.............................. + // trn1 v27.4s, v11.4s, v12.4s // .........................................................................e...................................................................................................'.............................. + // trn2 v28.4s, v11.4s, v12.4s // ..........................................................................e..................................................................................................'.............................. + // trn2 v11.2d, v25.2d, v27.2d // ..................................................................................................e..........................................................................'.............................. + // trn2 v12.2d, v26.2d, v28.2d // .......................................................................................e.....................................................................................'.............................. 
+ // trn1 v9.2d, v25.2d, v27.2d // .............................................................................e...............................................................................................'.............................. + // trn1 v10.2d, v26.2d, v28.2d // ..............................................................................e..............................................................................................'.............................. + // trn1 v25.4s, v13.4s, v14.4s // ...................................................................................................e.........................................................................'.............................. + // trn2 v26.4s, v13.4s, v14.4s // .................................................................................................e...........................................................................'.............................. + // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................................e...............................................................'.............................. + // trn2 v28.4s, v15.4s, v16.4s // ............................................................................................................e................................................................'.............................. + // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................................e...........................................................'.............................. + // trn2 v16.2d, v26.2d, v28.2d // ................................................................................................................e............................................................'.............................. 
+ // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................................e.........................................................'.............................. + // trn1 v14.2d, v26.2d, v28.2d // ..................................................................................................................e..........................................................'.............................. + // ldr q0, [x4], #64 // .......................................................................................................e.....................................................................'.............................. + // ldr q1, [x4, #(-64 + 16)] // ...................................................................................e.........................................................................................'.............................. + // ldr q2, [x4, #(-64 + 32)] // ...............................................................................e.............................................................................................'.............................. + // ldr q3, [x4, #(-64 + 48)] // ................................................................................e............................................................................................'.............................. + // sub v24.4s, v9.4s, v10.4s // ..................................................................................e..........................................................................................'.............................. + // add v9.4s, v9.4s, v10.4s // ........................................................................................e....................................................................................'.............................. 
+ // mul v10.4s, v24.4s, v1.s[2] // ........................................................................................................e....................................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .........................................................................................................e...................................................................'.............................. + // mls v10.4s, v24.4s, v8.s[0] // ...............................................................................................................e.............................................................'.............................. + // sub v24.4s, v11.4s, v12.4s // .....................................................................................................e.......................................................................'.............................. + // add v11.4s, v11.4s, v12.4s // ..........................................................................................................e..................................................................'.............................. + // mul v12.4s, v24.4s, v2.s[0] // ..............................................................................................................e..............................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...........................................................................................................e.................................................................'.............................. + // mls v12.4s, v24.4s, v8.s[0] // .......................................................................................................................e.....................................................'.............................. 
+ // sub v24.4s, v13.4s, v14.4s // ......................................................................................................................e......................................................'.............................. + // add v13.4s, v13.4s, v14.4s // ............................................................................................................................e................................................'.............................. + // mul v14.4s, v24.4s, v2.s[2] // ...........................................................................................................................e.................................................'.............................. + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..............................................................................................................................e..............................................'.............................. + // mls v14.4s, v24.4s, v8.s[0] // ....................................................................................................................................e........................................'.............................. + // sub v24.4s, v15.4s, v16.4s // .....................................................................................................................e.......................................................'.............................. + // add v15.4s, v15.4s, v16.4s // ..........................................................................................................................e..................................................'.............................. + // mul v16.4s, v24.4s, v3.s[0] // ........................................................................................................................e....................................................'.............................. 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // .........................................................................................................................e...................................................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................................e..........................................'.............................. + // sub v24.4s, v9.4s, v11.4s // .................................................................................................................................e...........................................'.............................. + // add v9.4s, v9.4s, v11.4s // ....................................................................................................................e........................................................'.............................. + // mul v11.4s, v24.4s, v0.s[2] // .....................................................................................................................................................e.......................'.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................................e.....................'.............................. + // mls v11.4s, v24.4s, v8.s[0] // ............................................................................................................................................................e................'.............................. + // sub v24.4s, v10.4s, v12.4s // ...........................................................................................................................................e.................................'.............................. 
+ // add v10.4s, v10.4s, v12.4s // ...............................................................................................................................e.............................................'.............................. + // mul v12.4s, v24.4s, v0.s[2] // .................................................................................................................................................e...........................'.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...............................................................................................................................................e.............................'.............................. + // mls v12.4s, v24.4s, v8.s[0] // ......................................................................................................................................................e......................'.............................. + // sub v24.4s, v13.4s, v15.4s // ................................................................................................................................e............................................'.............................. + // add v13.4s, v13.4s, v15.4s // ...................................................................................................................................e.........................................'.............................. + // mul v15.4s, v24.4s, v1.s[0] // ......................................................................................................................................e......................................'.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................................................................................................................e...................................'.............................. 
+ // mls v15.4s, v24.4s, v8.s[0] // .........................................................................................................................................................e...................'.............................. + // sub v24.4s, v14.4s, v16.4s // ............................................................................................................................................e................................'.............................. + // add v14.4s, v14.4s, v16.4s // ..........................................................................................................................................e..................................'.............................. + // mul v16.4s, v24.4s, v1.s[0] // ..................................................................................................................................................e..........................'.............................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................................e.........................'.............................. + // mls v16.4s, v24.4s, v8.s[0] // ........................................................................................................................................................e....................'.............................. + // srshr v24.4S, v9.4S, #23 // .............................................................................................................................e...............................................'.............................. + // mls v9.4s, v24.4s, v8.4s // ........................................................................................................................................e....................................'.............................. 
+ // srshr v24.4S, v10.4S, #23 // .....................................................................................................................................e.......................................'.............................. + // mls v10.4s, v24.4s, v8.4s // .............................................................................................................................................e...............................'.............................. + // srshr v24.4S, v13.4S, #23 // .......................................................................................................................................e.....................................'.............................. + // mls v13.4s, v24.4s, v8.4s // ................................................................................................................................................e............................'.............................. + // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e..............................'.............................. + // mls v14.4s, v24.4s, v8.4s // ....................................................................................................................................................e........................'.............................. + // sub v24.4s, v9.4s, v13.4s // .......................................................................................................................................................................e.....'.............................. + // add v9.4s, v9.4s, v13.4s // ...........................................................................................................................................................e.................'.............................. 
+ // mul v13.4s, v24.4s, v0.s[0] // .~...........................................................................................................................................................................'.*............................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................................................................................................................*.............................. + // mls v13.4s, v24.4s, v8.s[0] // .........~...................................................................................................................................................................'.........*.................... + // sub v24.4s, v10.4s, v14.4s // ..........................................................................................................................................................e..................'.............................. + // add v10.4s, v10.4s, v14.4s // .............................................................................................................................................................e...............'.............................. + // mul v14.4s, v24.4s, v0.s[0] // ..................................................................................................................................................................e..........'.............................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................................................................................................................................e...........'.............................. + // mls v14.4s, v24.4s, v8.s[0] // .........................................................................................................................................................................e...'.............................. 
+ // sub v24.4s, v11.4s, v15.4s // ............................................................................................................................................................................e'.............................. + // add v11.4s, v11.4s, v15.4s // ...................................................................................................................................................................e.........'.............................. + // mul v15.4s, v24.4s, v0.s[0] // ........~....................................................................................................................................................................'........*..................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...~.........................................................................................................................................................................'...*.......................... + // mls v15.4s, v24.4s, v8.s[0] // .................~...........................................................................................................................................................'.................*............ + // sub v24.4s, v12.4s, v16.4s // ...............................................................................................................................................................e.............'.............................. + // add v12.4s, v12.4s, v16.4s // ..............................................................................................................................................................e..............'.............................. + // mul v16.4s, v24.4s, v0.s[0] // ........................................................................................................................................................................e....'.............................. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................................................................e.......'.............................. + // mls v16.4s, v24.4s, v8.s[0] // .......~.....................................................................................................................................................................'.......*...................... + // str q9, [x1], #(16*4) // ................................................................................................................................................................e............'.............................. + // str q10, [x1, #(-16*4 + 1*16)] // ....................................................................................................................................................................e........'.............................. + // str q11, [x1, #(-16*4 + 2*16)] // ..........................................................................................................................................................................e..'.............................. + // str q12, [x1, #(-16*4 + 3*16)] // ......................................................................................................................................................................e......'.............................. + // str q13, [x2], #(16*4) // ..........................~..................................................................................................................................................'..........................*... + // str q14, [x2, #(-16*4 + 1*16)] // ......~......................................................................................................................................................................'......*....................... 
+ // str q15, [x2, #(-16*4 + 2*16)] // ............................~................................................................................................................................................'............................*. + // str q16, [x2, #(-16*4 + 3*16)] // .............~...............................................................................................................................................................'.............*................ + // add x1, x1, #64 // ...........................................................................................................................................................................e.'.............................. + // add x2, x2, #64 // .............................~...............................................................................................................................................'.............................* + + sub count, count, #1 + cbnz count, layer45678_start + // Instructions: 12 + // Expected cycles: 8 + // Expected IPC: 1.50 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + mul v10.4S, v1.4S, v16.S[0] // .*............................ + sqrdmulh v24.4S, v1.4S, v16.S[1] // *............................. + // gap // .............................. + // gap // .............................. + mul v7.4S, v23.4S, v16.S[0] // .....*........................ + sqrdmulh v9.4S, v23.4S, v16.S[1] // ..*........................... + // gap // .............................. + // gap // .............................. + mls v11.4S, v27.4S, v8.S[0] // ....*......................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. 
+ mls v10.4S, v24.4S, v8.S[0] // ......*....................... + mls v7.4S, v9.4S, v8.S[0] // ........*..................... + // gap // .............................. + // gap // .............................. + str q25, [x2, #16] // ...*.......................... + // gap // .............................. + // gap // .............................. + str q11, [x2, #48] // .......*...................... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q10, [x2], #(16*4) // .........*.................... + // gap // .............................. + str q7, [x2, #-32] // ..........*................... + // gap // .............................. + add x2, x2, #64 // ...........*.................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqrdmulh v9.4S, v1.4S, v16.S[1] // .*............................. + // mul v1.4S, v1.4S, v16.S[0] // *.............................. + // sqrdmulh v24.4S, v23.4S, v16.S[1] // ...*........................... + // str q25, [x2, #16] // .......*....................... + // mls v11.4S, v27.4S, v8.S[0] // ....*.......................... + // mul v7.4S, v23.4S, v16.S[0] // ..*............................ + // mls v1.4S, v9.4S, v8.S[0] // .....*......................... + // str q11, [x2, #48] // ........*...................... + // mls v7.4S, v24.4S, v8.S[0] // ......*........................ + // str q1, [x2], #(16*4) // .........*..................... + // str q7, [x2, #-32] // ..........*.................... + // add x2, x2, #64 // ...........*................... 
+
+
+// -----------------------------------------------------------------------------
+
+        // Set-up that runs after the layer45678 loop and before the scheduled
+        // code below: name four vector registers, fill them with constants,
+        // then initialise the counter and the root pointer.
+        //
+        // Register aliases. The scheduled code below refers to these registers
+        // by number rather than by alias: v25/v26 appear as operands of
+        // mul/sqrdmulh, and v30/v31 as operands of cmge.
+        ninv .req v25
+        ninv_tw .req v26
+        modulus_half .req v30
+        neg_modulus_half .req v31
+
+        // ld1r reads one 32-bit element from [xtmp] and replicates it into all
+        // four .4s lanes of the destination.
+        // NOTE(review): ASM_LOAD is a macro defined elsewhere; presumably it
+        // places the address of the named symbol in xtmp -- confirm.
+        ASM_LOAD(xtmp, ninv_addr)
+        ld1r {ninv.4s}, [xtmp]
+        ASM_LOAD(xtmp, ninv_tw_addr)
+        ld1r {ninv_tw.4s}, [xtmp]
+
+        // modulus_half = consts >> 1 (unsigned shift, per 32-bit lane), and
+        // neg_modulus_half = -modulus_half (per lane).
+        // NOTE(review): presumably every lane of consts holds the modulus --
+        // confirm against the definition of consts, since all four lanes are
+        // shifted here.
+        ushr modulus_half.4S, consts.4S, #1
+        neg neg_modulus_half.4S, modulus_half.4S
+
+        // NOTE(review): count is presumably the iteration counter of the loop
+        // that follows (the previous loop ends with sub count / cbnz count);
+        // the end of the following loop is not visible here -- confirm.
+        mov count, #8
+        // NOTE(review): load_roots_123 is a macro defined elsewhere; presumably
+        // it loads the roots addressed by r_ptr0 into vector registers -- confirm.
+        ASM_LOAD(r_ptr0, roots_l012)
+        load_roots_123
+
+        // Align the next instruction to a 2^2 = 4-byte boundary.
+        .p2align 2
+        // Instructions:    95
+        // Expected cycles: 47
+        // Expected IPC:    2.02
+        //
+        // Wall time:     38.83s
+        // User time:     38.83s
+        //
+        // ------------------------------------- original position -------------------------------------->
+        // 0                        25                       50                       75
+        // |------------------------|------------------------|------------------------|-------------------
+        // gap                                  // ...............................................................................................
+        // gap                                  // ...............................................................................................
+        ldr q7, [x0, #768]                      // .*.............................................................................................
+        ldr q11, [x0, #896]                     // ....*..........................................................................................
+        ldr q23, [x0, #640]                     // ...*...........................................................................................
+        // gap                                  // ...............................................................................................
+        // gap                                  // ...............................................................................................
+        ldr q20, [x0, #512]                     // ........*......................................................................................
+        ldr q21, [x0, #128]                     // .....*.........................................................................................
+        // gap                                  // ...............................................................................................
+ // gap // ............................................................................................... + ldr q9, [x0, #0] // ......*........................................................................................ + ldr q15, [x0, #256] // ..*............................................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v27.4S, v7.4S, v11.4S // ..............................*................................................................ + add v5.4S, v7.4S, v11.4S // .........*..................................................................................... + ldr q28, [x0, #384] // *.............................................................................................. + // gap // ............................................................................................... + sub v7.4S, v20.4S, v23.4S // ..............*................................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + add v17.4S, v20.4S, v23.4S // ...............*............................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v23.4S, v27.4S, v3.S[0] // ....................................*.......................................................... 
+ sqrdmulh v27.4S, v27.4S, v3.S[1] // .....................................*......................................................... + sqrdmulh v11.4S, v7.4S, v2.S[3] // ..........................*.................................................................... + mul v13.4S, v7.4S, v2.S[2] // ............................*.................................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v16.4S, v9.4S, v21.4S // ............*.................................................................................. + add v24.4S, v15.4S, v28.4S // ...........*................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v22.4S, v15.4S, v28.4S // .......*....................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + add v7.4S, v9.4S, v21.4S // .............*................................................................................. + mls v13.4S, v11.4S, v8.S[0] // .........................................*..................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ mls v23.4S, v27.4S, v8.S[0] // ..........................................*.................................................... + mul v20.4S, v16.4S, v1.S[2] // .........................*..................................................................... + // gap // ............................................................................................... + sub v27.4S, v7.4S, v24.4S // .................*............................................................................. + // gap // ............................................................................................... + add v18.4S, v7.4S, v24.4S // ........................*...................................................................... + sub v9.4S, v17.4S, v5.4S // ...................*........................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v24.4S, v22.4S, v2.S[1] // .................................*............................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v11.4S, v13.4S, v23.4S // ....................................................*.......................................... + mul v14.4S, v27.4S, v0.S[2] // ....................*.......................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ sqrdmulh v15.4S, v27.4S, v0.S[3] // .....................*......................................................................... + sqrdmulh v27.4S, v11.4S, v1.S[1] // ........................................................*...................................... + mul v7.4S, v11.4S, v1.S[0] // .......................................................*....................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v21.4S, v9.4S, v1.S[0] // .......................*....................................................................... + sqrdmulh v11.4S, v9.4S, v1.S[1] // ......................*........................................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v16.4S, v16.4S, v1.S[3] // ................*.............................................................................. + mul v9.4S, v22.4S, v2.S[0] // ..........*.................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v7.4S, v27.4S, v8.S[0] // ..............................................................*................................ + add v27.4S, v17.4S, v5.4S // ..................*............................................................................ + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + mls v21.4S, v11.4S, v8.S[0] // .............................*................................................................. + mls v14.4S, v15.4S, v8.S[0] // ...........................*................................................................... + // gap // ............................................................................................... + mls v9.4S, v24.4S, v8.S[0] // ........................................*...................................................... + mls v20.4S, v16.4S, v8.S[0] // ................................*.............................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v4.4S, v18.4S, v27.4S // ...............................*............................................................... + add v17.4S, v18.4S, v27.4S // ..................................*............................................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + add v24.4S, v13.4S, v23.4S // ...............................................*............................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v11.4S, v14.4S, v21.4S // ...................................*........................................................... 
+ add v13.4S, v20.4S, v9.4S // ..............................................*................................................ + sub v27.4S, v20.4S, v9.4S // .............................................*................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v22.4S, v11.4S, v0.S[0] // ......................................*........................................................ + sqrdmulh v20.4S, v11.4S, v0.S[1] // .......................................*....................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v11.4S, v13.4S, v24.4S // ..........................................................*.................................... + mul v10.4S, v4.4S, v0.S[0] // ............................................*.................................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v23.4S, v27.4S, v0.S[3] // .....................................................*......................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v9.4S, v27.4S, v0.S[2] // ...................................................*........................................... 
+ mls v22.4S, v20.4S, v8.S[0] // ...........................................*................................................... + mul v27.4S, v11.4S, v0.S[0] // ................................................................*.............................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v15.4S, v11.4S, v0.S[1] // .............................................................*................................. + add v18.4S, v13.4S, v24.4S // ............................................................*.................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v9.4S, v23.4S, v8.S[0] // ...........................................................*................................... + add v11.4S, v14.4S, v21.4S // .......................................................................*....................... + cmge v20.4S, v31.4S, v22.4S // ..................................................*............................................ + mul v24.4S, v18.4S, v25.4S // ...................................................................*........................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ sqrdmulh v21.4S, v11.4S, v26.4S // ..............................................................................*................ + // gap // ............................................................................................... + cmge v16.4S, v22.4S, v30.4S // .................................................*............................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + sub v13.4S, v9.4S, v7.4S // ....................................................................*.......................... + add v23.4S, v9.4S, v7.4S // .....................................................................*......................... + mul v9.4S, v11.4S, v25.4S // ...........................................................................*................... + sub v11.4S, v20.4S, v16.4S // ......................................................*........................................ + // gap // ............................................................................................... + // gap // ............................................................................................... + mul v7.4S, v13.4S, v0.S[0] // ..........................................................................*.................... + sqrdmulh v20.4S, v13.4S, v0.S[1] // ........................................................................*...................... + // gap // ............................................................................................... + // gap // ............................................................................................... 
+ // gap // ............................................................................................... + // gap // ............................................................................................... + mls v27.4S, v15.4S, v8.S[0] // ......................................................................*........................ + mul v13.4S, v23.4S, v25.4S // .........................................................................*..................... + sqrdmulh v15.4S, v23.4S, v26.4S // .............................................................................*................. + mls v22.4S, v11.4S, v8.4S // .........................................................*..................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v9.4S, v21.4S, v8.S[0] // ...................................................................................*........... + mls v7.4S, v20.4S, v8.S[0] // ................................................................................*.............. + sqrdmulh v11.4S, v4.4S, v0.S[1] // ................................................*.............................................. + // gap // ............................................................................................... + // gap // ............................................................................................... + sqrdmulh v18.4S, v18.4S, v26.4S // .................................................................*............................. 
+ mls v13.4S, v15.4S, v8.S[0] // .................................................................................*............. + str q22, [x0, #768] // ...............................................................*............................... + // gap // ............................................................................................... + cmge v28.4S, v31.4S, v27.4S // ............................................................................*.................. + cmge v20.4S, v7.4S, v30.4S // .....................................................................................*......... + cmge v21.4S, v31.4S, v7.4S // ....................................................................................*.......... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v10.4S, v11.4S, v8.S[0] // ..................................................................*............................ + cmge v22.4S, v31.4S, v9.4S // ...........................................................................................*... + cmge v11.4S, v13.4S, v30.4S // .......................................................................................*....... + // gap // ............................................................................................... + // gap // ............................................................................................... + cmge v23.4S, v31.4S, v13.4S // ......................................................................................*........ 
+ sub v20.4S, v21.4S, v20.4S // ........................................................................................*...... + // gap // ............................................................................................... + // gap // ............................................................................................... + mls v24.4S, v18.4S, v8.S[0] // ...............................................................................*............... + sub v11.4S, v23.4S, v11.4S // ..........................................................................................*.... + mul v23.4S, v17.4S, v25.4S // ..............................................................................................* + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + // gap // ............................................................................................... + cmge v29.4S, v31.4S, v10.4S // ..................................................................................*............ + mls v7.4S, v20.4S, v8.4S // ............................................................................................*.. + mls v13.4S, v11.4S, v8.4S // .............................................................................................*. + // gap // ............................................................................................... + // gap // ............................................................................................... + cmge v14.4S, v24.4S, v30.4S // .........................................................................................*..... 
+ + // ---------------------------------------- new position ----------------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------------------- + // ldr q15, [x0, #384] // .........*..................................................................................... + // ldr q12, [x0, #768] // *.............................................................................................. + // ldr q20, [x0, #256] // ......*........................................................................................ + // ldr q19, [x0, #640] // ..*............................................................................................ + // ldr q4, [x0, #896] // .*............................................................................................. + // ldr q11, [x0, #128] // ....*.......................................................................................... + // ldr q5, [x0, #0] // .....*......................................................................................... + // sub v14.4S, v20.4S, v15.4S // ..................*............................................................................ + // ldr q17, [x0, #512] // ...*........................................................................................... + // add v28.4S, v12.4S, v4.4S // ........*...................................................................................... + // mul v21.4S, v14.4S, v2.S[0] // ...................................*........................................................... + // add v15.4S, v20.4S, v15.4S // .................*............................................................................. + // sub v20.4S, v5.4S, v11.4S // ................*.............................................................................. + // add v5.4S, v5.4S, v11.4S // ...................*........................................................................... 
+ // sub v29.4S, v17.4S, v19.4S // ..........*.................................................................................... + // add v11.4S, v17.4S, v19.4S // ...........*................................................................................... + // sqrdmulh v19.4S, v20.4S, v1.S[3] // ..................................*............................................................ + // sub v6.4S, v5.4S, v15.4S // .......................*....................................................................... + // add v7.4S, v11.4S, v28.4S // .....................................*......................................................... + // sub v28.4S, v11.4S, v28.4S // .........................*..................................................................... + // mul v11.4S, v6.4S, v0.S[2] // ............................*.................................................................. + // sqrdmulh v17.4S, v6.4S, v0.S[3] // .............................*................................................................. + // sqrdmulh v6.4S, v28.4S, v1.S[1] // .................................*............................................................. + // mul v18.4S, v28.4S, v1.S[0] // ................................*.............................................................. + // add v27.4S, v5.4S, v15.4S // ........................*...................................................................... + // mul v20.4S, v20.4S, v1.S[2] // ......................*........................................................................ + // sqrdmulh v28.4S, v29.4S, v2.S[3] // ..............*................................................................................ + // mls v11.4S, v17.4S, v8.S[0] // .......................................*....................................................... + // mul v15.4S, v29.4S, v2.S[2] // ...............*............................................................................... 
+ // mls v18.4S, v6.4S, v8.S[0] // ......................................*........................................................ + // sub v5.4S, v12.4S, v4.4S // .......*....................................................................................... + // sub v6.4S, v27.4S, v7.4S // ..........................................*.................................................... + // mls v20.4S, v19.4S, v8.S[0] // .........................................*..................................................... + // sqrdmulh v29.4S, v14.4S, v2.S[1] // ..........................*.................................................................... + // add v17.4S, v27.4S, v7.4S // ...........................................*................................................... + // sub v10.4S, v11.4S, v18.4S // .............................................*................................................. + // mul v7.4S, v5.4S, v3.S[0] // ............*.................................................................................. + // sqrdmulh v19.4S, v5.4S, v3.S[1] // .............*................................................................................. + // mul v27.4S, v10.4S, v0.S[0] // ................................................*.............................................. + // sqrdmulh v10.4S, v10.4S, v0.S[1] // .................................................*............................................. + // mls v21.4S, v29.4S, v8.S[0] // ........................................*...................................................... + // mls v15.4S, v28.4S, v8.S[0] // ....................*.......................................................................... + // mls v7.4S, v19.4S, v8.S[0] // .....................*......................................................................... + // mls v27.4S, v10.4S, v8.S[0] // ......................................................*........................................ 
+ // mul v10.4S, v6.4S, v0.S[0] // ...................................................*........................................... + // sub v29.4S, v20.4S, v21.4S // ...............................................*............................................... + // add v22.4S, v20.4S, v21.4S // ..............................................*................................................ + // add v4.4S, v15.4S, v7.4S // ............................................*.................................................. + // sqrdmulh v5.4S, v6.4S, v0.S[1] // ............................................................................*.................. + // cmge v12.4S, v27.4S, v30.4S // ...............................................................*............................... + // cmge v19.4S, v31.4S, v27.4S // ............................................................*.................................. + // mul v21.4S, v29.4S, v0.S[2] // .....................................................*......................................... + // sub v28.4S, v15.4S, v7.4S // ...........................*................................................................... + // sqrdmulh v6.4S, v29.4S, v0.S[3] // ....................................................*.......................................... + // sub v29.4S, v19.4S, v12.4S // ...................................................................*........................... + // mul v20.4S, v28.4S, v1.S[0] // ...............................*............................................................... + // sqrdmulh v14.4S, v28.4S, v1.S[1] // ..............................*................................................................ + // mls v27.4S, v29.4S, v8.4S // .........................................................................*..................... + // sub v12.4S, v22.4S, v4.4S // ..................................................*............................................ 
+ // mls v21.4S, v6.4S, v8.S[0] // ..........................................................*.................................... + // add v29.4S, v22.4S, v4.4S // .........................................................*..................................... + // sqrdmulh v22.4S, v12.4S, v0.S[1] // ........................................................*...................................... + // mls v20.4S, v14.4S, v8.S[0] // ....................................*.......................................................... + // str q27, [x0, #768] // ...............................................................................*............... + // mul v27.4S, v12.4S, v0.S[0] // .......................................................*....................................... + // sqrdmulh v6.4S, v29.4S, v26.4S // .............................................................................*................. + // mls v10.4S, v5.4S, v8.S[0] // ...................................................................................*........... + // mul v24.4S, v29.4S, v25.4S // .............................................................*................................. + // sub v29.4S, v21.4S, v20.4S // ................................................................*.............................. + // add v12.4S, v21.4S, v20.4S // .................................................................*............................. + // mls v27.4S, v22.4S, v8.S[0] // ......................................................................*........................ + // add v14.4S, v11.4S, v18.4S // ...........................................................*................................... + // sqrdmulh v19.4S, v29.4S, v0.S[1] // .....................................................................*......................... + // mul v13.4S, v12.4S, v25.4S // .......................................................................*....................... 
+ // mul v7.4S, v29.4S, v0.S[0] // ....................................................................*.......................... + // mul v9.4S, v14.4S, v25.4S // ..................................................................*............................ + // cmge v28.4S, v31.4S, v27.4S // ................................................................................*.............. + // sqrdmulh v12.4S, v12.4S, v26.4S // ........................................................................*...................... + // sqrdmulh v20.4S, v14.4S, v26.4S // ..............................................................*................................ + // mls v24.4S, v6.4S, v8.S[0] // ........................................................................................*...... + // mls v7.4S, v19.4S, v8.S[0] // ...........................................................................*................... + // mls v13.4S, v12.4S, v8.S[0] // ..............................................................................*................ + // cmge v29.4S, v31.4S, v10.4S // ...........................................................................................*... + // mls v9.4S, v20.4S, v8.S[0] // ..........................................................................*.................... + // cmge v15.4S, v31.4S, v7.4S // ..................................................................................*............ + // cmge v21.4S, v7.4S, v30.4S // .................................................................................*............. + // cmge v5.4S, v31.4S, v13.4S // ......................................................................................*........ + // cmge v12.4S, v13.4S, v30.4S // .....................................................................................*......... + // sub v15.4S, v15.4S, v21.4S // .......................................................................................*....... 
+ // cmge v14.4S, v24.4S, v30.4S // ..............................................................................................* + // sub v21.4S, v5.4S, v12.4S // .........................................................................................*..... + // cmge v22.4S, v31.4S, v9.4S // ....................................................................................*.......... + // mls v7.4S, v15.4S, v8.4S // ............................................................................................*.. + // mls v13.4S, v21.4S, v8.4S // .............................................................................................*. + // mul v23.4S, v17.4S, v25.4S // ..........................................................................................*.... + + sub count, count, #1 +layer123_start: + // Instructions: 120 + // Expected cycles: 52 + // Expected IPC: 2.31 + // + // Wall time: 122.00s + // User time: 122.00s + // + // -------------------------------------------------- original position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------------- + sqrdmulh v5.4S, v17.4S, v26.4S // .........................................................................................*.............................. + cmge v18.4S, v10.4S, v30.4S // .....................................................................*.................................................. + ldr q15, [x0, #400] // ...e.................................................................................................................... + ldr q12, [x0, #784] // ......e................................................................................................................. + ldr q20, [x0, #272] // ..e..................................................................................................................... 
+ cmge v16.4S, v9.4S, v30.4S // .............................................................................................................*.......... + cmge v17.4S, v27.4S, v30.4S // .........................................................................*.............................................. + ldr q19, [x0, #656] // .....e.................................................................................................................. + ldr q4, [x0, #912] // .......e................................................................................................................ + sub v6.4S, v29.4S, v18.4S // ......................................................................*................................................. + cmge v21.4S, v31.4S, v24.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + mls v23.4S, v5.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + sub v16.4S, v22.4S, v16.4S // ..............................................................................................................*......... + ldr q11, [x0, #144] // .e...................................................................................................................... + ldr q5, [x0, #16] // e....................................................................................................................... + sub v18.4S, v28.4S, v17.4S // ..........................................................................*............................................. 
+ // gap // ........................................................................................................................ + sub v22.4S, v21.4S, v14.4S // ..........................................................................................................*............. + sub v14.4S, v20.4S, v15.4S // .............e.......................................................................................................... + // gap // ........................................................................................................................ + mls v9.4S, v16.4S, v8.4S // ...............................................................................................................*........ + ldr q17, [x0, #528] // ....e................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v16.4S, v31.4S, v23.4S // ....................................................................................................*................... + add v28.4S, v12.4S, v4.4S // ........................e............................................................................................... + mul v21.4S, v14.4S, v2.S[0] // ...............e........................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v15.4S, v20.4S, v15.4S // ..............e......................................................................................................... + sub v20.4S, v5.4S, v11.4S // ........e............................................................................................................... + add v5.4S, v5.4S, v11.4S // .........e.............................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v29.4S, v17.4S, v19.4S // ..................e..................................................................................................... + add v11.4S, v17.4S, v19.4S // ...................e.................................................................................................... + // gap // ........................................................................................................................ + mls v10.4S, v6.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v20.4S, v1.S[3] // ...........e............................................................................................................ 
+ // gap // ........................................................................................................................ + mls v27.4S, v18.4S, v8.4S // ...........................................................................*............................................ + // gap // ........................................................................................................................ + sub v6.4S, v5.4S, v15.4S // ............................e........................................................................................... + // gap // ........................................................................................................................ + str q7, [x0, #896] // .......................................................................................*................................ + add v7.4S, v11.4S, v28.4S // .......................................e................................................................................ + sub v28.4S, v11.4S, v28.4S // ......................................e................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v11.4S, v6.4S, v0.S[2] // ..............................e......................................................................................... + sqrdmulh v17.4S, v6.4S, v0.S[3] // ...............................e........................................................................................ + sqrdmulh v6.4S, v28.4S, v1.S[1] // .........................................e.............................................................................. 
+ // gap // ........................................................................................................................ + str q27, [x0, #640] // .....................................................................................*.................................. + mul v18.4S, v28.4S, v1.S[0] // ........................................e............................................................................... + add v27.4S, v5.4S, v15.4S // .............................e.......................................................................................... + mul v20.4S, v20.4S, v1.S[2] // ..........e............................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. + mls v11.4S, v17.4S, v8.S[0] // ................................e....................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mul v15.4S, v29.4S, v2.S[2] // ....................e................................................................................................... + mls v18.4S, v6.4S, v8.S[0] // ..........................................e............................................................................. + sub v5.4S, v12.4S, v4.4S // .......................e................................................................................................ + // gap // ........................................................................................................................ + sub v6.4S, v27.4S, v7.4S // ................................................e....................................................................... + // gap // ........................................................................................................................ + mls v20.4S, v19.4S, v8.S[0] // ............e........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v14.4S, v2.S[1] // ................e....................................................................................................... + add v17.4S, v27.4S, v7.4S // .................................................e...................................................................... + // gap // ........................................................................................................................ + str q10, [x0, #512] // ....................................................................................*................................... 
+ sub v10.4S, v11.4S, v18.4S // ..........................................................e............................................................. + mul v7.4S, v5.4S, v3.S[0] // .........................e.............................................................................................. + sqrdmulh v19.4S, v5.4S, v3.S[1] // ..........................e............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v27.4S, v10.4S, v0.S[0] // ............................................................e........................................................... + sqrdmulh v10.4S, v10.4S, v0.S[1] // .............................................................e.......................................................... + mls v21.4S, v29.4S, v8.S[0] // .................e...................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v15.4S, v28.4S, v8.S[0] // ......................e................................................................................................. 
+ // gap // ........................................................................................................................ + mls v7.4S, v19.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ + mls v24.4S, v22.4S, v8.4S // ...........................................................................................................*............ + mls v27.4S, v10.4S, v8.S[0] // ..............................................................e......................................................... + mul v10.4S, v6.4S, v0.S[0] // ..................................................e..................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v29.4S, v20.4S, v21.4S // .................................e...................................................................................... + add v22.4S, v20.4S, v21.4S // ..................................e..................................................................................... + add v4.4S, v15.4S, v7.4S // ............................................e........................................................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + sqrdmulh v5.4S, v6.4S, v0.S[1] // ...................................................e.................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v12.4S, v27.4S, v30.4S // .............................................................................e.......................................... + cmge v19.4S, v31.4S, v27.4S // ............................................................................e........................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v21.4S, v29.4S, v0.S[2] // ...................................e.................................................................................... + sub v28.4S, v15.4S, v7.4S // ...........................................e............................................................................ + sqrdmulh v6.4S, v29.4S, v0.S[3] // ....................................e................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v29.4S, v19.4S, v12.4S // ..............................................................................e......................................... + mul v20.4S, v28.4S, v1.S[0] // .............................................e.......................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v28.4S, v1.S[1] // ..............................................e......................................................................... + mls v27.4S, v29.4S, v8.4S // ...............................................................................e........................................ + sub v12.4S, v22.4S, v4.4S // .....................................................e.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v6.4S, v8.S[0] // .....................................e.................................................................................. + add v29.4S, v22.4S, v4.4S // ......................................................e................................................................. 
+ sqrdmulh v22.4S, v12.4S, v0.S[1] // ........................................................e............................................................... + // gap // ........................................................................................................................ + mls v20.4S, v14.4S, v8.S[0] // ...............................................e........................................................................ + // gap // ........................................................................................................................ + str q27, [x0, #784] // ......................................................................................e................................. + mul v27.4S, v12.4S, v0.S[0] // .......................................................e................................................................ + // gap // ........................................................................................................................ + sqrdmulh v6.4S, v29.4S, v26.4S // ............................................................................................e........................... + mls v10.4S, v5.4S, v8.S[0] // ....................................................e................................................................... + // gap // ........................................................................................................................ + str q24, [x0, #128] // .....................................................................................................................*.. + mul v24.4S, v29.4S, v25.4S // ...........................................................................................e............................ + // gap // ........................................................................................................................ 
+ sub v29.4S, v21.4S, v20.4S // ...............................................................e........................................................ + add v12.4S, v21.4S, v20.4S // ................................................................e....................................................... + // gap // ........................................................................................................................ + mls v27.4S, v22.4S, v8.S[0] // .........................................................e.............................................................. + str q13, [x0, #384] // .......................................................................................................................* + // gap // ........................................................................................................................ + add v14.4S, v11.4S, v18.4S // ...........................................................e............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v29.4S, v0.S[1] // ..................................................................e..................................................... + mul v13.4S, v12.4S, v25.4S // .................................................................................................e...................... + str q9, [x0, #256] // ......................................................................................................................*. + mul v7.4S, v29.4S, v0.S[0] // .................................................................e...................................................... 
+ mul v9.4S, v14.4S, v25.4S // ..............................................................................................e......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v28.4S, v31.4S, v27.4S // ........................................................................e............................................... + sqrdmulh v12.4S, v12.4S, v26.4S // ..................................................................................................e..................... + cmge v21.4S, v23.4S, v30.4S // .....................................................................................................*.................. + sqrdmulh v20.4S, v14.4S, v26.4S // ...............................................................................................e........................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v24.4S, v6.4S, v8.S[0] // .............................................................................................e.......................... + mls v7.4S, v19.4S, v8.S[0] // ...................................................................e.................................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mls v13.4S, v12.4S, v8.S[0] // ...................................................................................................e.................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v10.4S // ....................................................................e................................................... + sub v18.4S, v16.4S, v21.4S // ......................................................................................................*................. + mls v9.4S, v20.4S, v8.S[0] // ................................................................................................e....................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v15.4S, v31.4S, v7.4S // ................................................................................e....................................... + cmge v21.4S, v7.4S, v30.4S // .................................................................................e...................................... + // gap // ........................................................................................................................ 
+ cmge v5.4S, v31.4S, v13.4S // ................................................................................................................e....... + cmge v12.4S, v13.4S, v30.4S // .................................................................................................................e...... + // gap // ........................................................................................................................ + mls v23.4S, v18.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v15.4S, v15.4S, v21.4S // ..................................................................................e..................................... + cmge v14.4S, v24.4S, v30.4S // .........................................................................................................e.............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v21.4S, v5.4S, v12.4S // ..................................................................................................................e..... + cmge v22.4S, v31.4S, v9.4S // ............................................................................................................e........... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mls v7.4S, v15.4S, v8.4S // ...................................................................................e.................................... + mls v13.4S, v21.4S, v8.4S // ...................................................................................................................e.... + str q23, [x0], #(16) // ....................................................................................................................*... + mul v23.4S, v17.4S, v25.4S // ........................................................................................e............................... + // gap // ........................................................................................................................ + + // --------------------------------------------------------------------------------------------------------------- new position ---------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------- + // ldr q9, [x0, #0] // ............e.........................................................................................................'.............~........................................................................................................ + // ldr q10, [x0, #(1*(1024/8))] // ...........e..........................................................................................................'............~......................................................................................................... 
+ // ldr q11, [x0, #(2*(1024/8))] // ..e...................................................................................................................'...~.................................................................................................................. + // ldr q12, [x0, #(3*(1024/8))] // e.....................................................................................................................'.~.................................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // .................e....................................................................................................'..................~................................................................................................... + // ldr q14, [x0, #(5*(1024/8))] // .....e................................................................................................................'......~............................................................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .e....................................................................................................................'..~................................................................................................................... + // ldr q16, [x0, #(7*(1024/8))] // ......e...............................................................................................................'.......~.............................................................................................................. + // sub v24.4s, v9.4s, v10.4s // ......................e...............................................................................................'.......................~.............................................................................................. 
+ // add v9.4s, v9.4s, v10.4s // .......................e..............................................................................................'........................~............................................................................................. + // mul v10.4s, v24.4s, v1.s[2] // .......................................e..............................................................................'........................................~............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ...........................e..........................................................................................'............................~......................................................................................... + // mls v10.4s, v24.4s, v8.s[0] // ..............................................e.......................................................................'...............................................~...................................................................... + // sub v24.4s, v11.4s, v12.4s // ...............e......................................................................................................'................~..................................................................................................... + // add v11.4s, v11.4s, v12.4s // .....................e................................................................................................'......................~............................................................................................... + // mul v12.4s, v24.4s, v2.s[0] // ....................e.................................................................................................'.....................~................................................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................e......................................................................'................................................~..................................................................... + // mls v12.4s, v24.4s, v8.s[0] // .......................................................e..............................................................'........................................................~............................................................. + // sub v24.4s, v13.4s, v14.4s // ........................e.............................................................................................'.........................~............................................................................................ + // add v13.4s, v13.4s, v14.4s // .........................e............................................................................................'..........................~........................................................................................... + // mul v14.4s, v24.4s, v2.s[2] // ..........................................e...........................................................................'...........................................~.......................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................e.............................................................................'.........................................~............................................................................ + // mls v14.4s, v24.4s, v8.s[0] // ........................................................e.............................................................'.........................................................~............................................................ 
+ // sub v24.4s, v15.4s, v16.4s // ............................................e.........................................................................'.............................................~........................................................................ + // add v15.4s, v15.4s, v16.4s // ...................e..................................................................................................'....................~................................................................................................. + // mul v16.4s, v24.4s, v3.s[0] // ...................................................e..................................................................'....................................................~................................................................. + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ....................................................e.................................................................'.....................................................~................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .........................................................e............................................................'..........................................................~........................................................... + // sub v24.4s, v9.4s, v11.4s // .............................e........................................................................................'..............................~....................................................................................... + // add v9.4s, v9.4s, v11.4s // ......................................e...............................................................................'.......................................~.............................................................................. 
+ // mul v11.4s, v24.4s, v0.s[2] // .................................e....................................................................................'..................................~................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ..................................e...................................................................................'...................................~.................................................................................. + // mls v11.4s, v24.4s, v8.s[0] // .........................................e............................................................................'..........................................~........................................................................... + // sub v24.4s, v10.4s, v12.4s // .............................................................e........................................................'..............................................................~....................................................... + // add v10.4s, v10.4s, v12.4s // ..............................................................e.......................................................'...............................................................~...................................................... + // mul v12.4s, v24.4s, v0.s[2] // ...................................................................e..................................................'....................................................................~................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................e................................................'......................................................................~............................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................e..........................................'............................................................................~......................................... + // sub v24.4s, v13.4s, v15.4s // ................................e.....................................................................................'.................................~.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............................e......................................................................................'................................~..................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // .....................................e................................................................................'......................................~............................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................e..................................................................................'....................................~................................................................................. + // mls v15.4s, v24.4s, v8.s[0] // ...........................................e..........................................................................'............................................~......................................................................... + // sub v24.4s, v14.4s, v16.4s // ....................................................................e.................................................'.....................................................................~................................................ 
+ // add v14.4s, v14.4s, v16.4s // ...............................................................e......................................................'................................................................~..................................................... + // mul v16.4s, v24.4s, v1.s[0] // .......................................................................e..............................................'........................................................................~............................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................................................................e.............................................'.........................................................................~............................................ + // mls v16.4s, v24.4s, v8.s[0] // ..............................................................................e.......................................'...............................................................................~...................................... + // sub v24.4s, v9.4s, v13.4s // .............................................e........................................................................'..............................................~....................................................................... + // add v9.4s, v9.4s, v13.4s // ................................................e.....................................................................'.................................................~.................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ............................................................e.........................................................'.............................................................~........................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................................e.....................................................'.................................................................~.................................................... + // mls v13.4s, v24.4s, v8.s[0] // ..................................................................................e...................................'...................................................................................~.................................. + // sub v24.4s, v10.4s, v14.4s // ..........................................................................e...........................................'...........................................................................~.......................................... + // add v10.4s, v10.4s, v14.4s // ............................................................................e.........................................'.............................................................................~........................................ + // mul v14.4s, v24.4s, v0.s[0] // ................................................................................e.....................................'.................................................................................~.................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................................................................e........................................'..............................................................................~....................................... + // mls v14.4s, v24.4s, v8.s[0] // .......................................................................................e..............................'........................................................................................~............................. 
+ // sub v24.4s, v11.4s, v15.4s // ..................................................e...................................................................'...................................................~.................................................................. + // add v11.4s, v11.4s, v15.4s // .........................................................................................e............................'..........................................................................................~........................... + // mul v15.4s, v24.4s, v0.s[0] // .....................................................e................................................................'......................................................~............................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................e...............................................................'.......................................................~.............................................................. + // mls v15.4s, v24.4s, v8.s[0] // ...........................................................e..........................................................'............................................................~......................................................... + // sub v24.4s, v12.4s, v16.4s // .....................................................................................e................................'......................................................................................~............................... + // add v12.4s, v12.4s, v16.4s // ......................................................................................e...............................'.......................................................................................~.............................. 
+ // mul v16.4s, v24.4s, v0.s[0] // .............................................................................................e........................'..............................................................................................~....................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................................................................e...........................'...........................................................................................~.......................... + // mls v16.4s, v24.4s, v8.s[0] // ....................................................................................................e.................'.....................................................................................................~................ + // cmge v27.4s, v31.4s, v13.4s // ......................................................................................................e...............'.......................................................................................................~.............. + // cmge v28.4s, v13.4s, v30.4s // ......................................................................................................................'*..................................................................................................................... + // sub v28.4s, v27.4s, v28.4s // .......~..............................................................................................................'........*............................................................................................................. + // mls v13.4s, v28.4s, v8.4s // ..........................~...........................................................................................'...........................*.......................................................................................... 
+ // cmge v27.4s, v31.4s, v14.4s // ...............................................................................................e......................'................................................................................................~..................... + // cmge v28.4s, v14.4s, v30.4s // ....~.................................................................................................................'.....*................................................................................................................ + // sub v28.4s, v27.4s, v28.4s // .............~........................................................................................................'..............*....................................................................................................... + // mls v14.4s, v28.4s, v8.4s // ............................~.........................................................................................'.............................*........................................................................................ + // cmge v27.4s, v31.4s, v15.4s // ..................................................................e...................................................'...................................................................~.................................................. + // cmge v28.4s, v15.4s, v30.4s // .................................................................e....................................................'..................................................................~................................................... + // sub v28.4s, v27.4s, v28.4s // ......................................................................e...............................................'.......................................................................~.............................................. 
+ // mls v15.4s, v28.4s, v8.4s // .........................................................................e............................................'..........................................................................~........................................... + // cmge v27.4s, v31.4s, v16.4s // .........................................................................................................e............'..........................................................................................................~........... + // cmge v28.4s, v16.4s, v30.4s // ..........................................................................................................e...........'...........................................................................................................~.......... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................................................e.......'...............................................................................................................~...... + // mls v16.4s, v28.4s, v8.4s // ..................................................................................................................e...'...................................................................................................................~.. + // str q13, [x0, #(4*(1024/8))] // .................................................~....................................................................'..................................................*................................................................... + // str q14, [x0, #(5*(1024/8))] // ....................................~.................................................................................'.....................................*................................................................................ 
+ // str q15, [x0, #(6*(1024/8))] // ...............................................................................e......................................'................................................................................~..................................... + // str q16, [x0, #(7*(1024/8))] // ..............................~.......................................................................................'...............................*...................................................................................... + // mul v13.4s, v9.4s, v25.4s // .....................................................................................................................e'...................................................................................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ......................................................................................................................*...................................................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // .........~............................................................................................................'..........*........................................................................................................... + // mul v14.4s, v10.4s, v25.4s // ....................................................................................e.................................'.....................................................................................~................................ + // sqrdmulh v10.4s, v10.4s, v26.4s // .................................................................................e....................................'..................................................................................~................................... 
+ // mls v14.4s, v10.4s, v8.s[0] // ...................................................................................................e..................'....................................................................................................~................. + // mul v15.4s, v11.4s, v25.4s // ..............................................................................................e.......................'...............................................................................................~...................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ..................................................................................................e...................'...................................................................................................~.................. + // mls v15.4s, v11.4s, v8.s[0] // ........................................................................................................e.............'.........................................................................................................~............ + // mul v16.4s, v12.4s, v25.4s // ...........................................................................................e..........................'............................................................................................~......................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ................................................................................................e.....................'.................................................................................................~.................... + // mls v16.4s, v12.4s, v8.s[0] // .....................................................................................................e................'......................................................................................................~............... 
+ // cmge v27.4s, v31.4s, v13.4s // ..................~...................................................................................................'...................*.................................................................................................. + // cmge v28.4s, v13.4s, v30.4s // .................................................................................................~....................'..................................................................................................*................... + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................~..............'........................................................................................................*............. + // mls v13.4s, v28.4s, v8.4s // .............................................................................................................~........'..............................................................................................................*....... + // cmge v27.4s, v31.4s, v14.4s // ........~.............................................................................................................'.........*............................................................................................................ + // cmge v28.4s, v14.4s, v30.4s // ...............................................................................................................e......'................................................................................................................~..... + // sub v28.4s, v27.4s, v28.4s // ..............~.......................................................................................................'...............*...................................................................................................... 
+ // mls v14.4s, v28.4s, v8.4s // ..........................................................~...........................................................'...........................................................*.......................................................... + // cmge v27.4s, v31.4s, v15.4s // .................................................................................................................e....'..................................................................................................................~... + // cmge v28.4s, v15.4s, v30.4s // ...~..................................................................................................................'....*................................................................................................................. + // sub v28.4s, v27.4s, v28.4s // ..........~...........................................................................................................'...........*.......................................................................................................... + // mls v15.4s, v28.4s, v8.4s // ................~.....................................................................................................'.................*.................................................................................................... + // cmge v27.4s, v31.4s, v16.4s // ...........................................................................................................e..........'............................................................................................................~......... + // cmge v28.4s, v16.4s, v30.4s // ............................................................................................................e.........'.............................................................................................................~........ 
+ // sub v28.4s, v27.4s, v28.4s // ................................................................................................................e.....'.................................................................................................................~.... + // mls v16.4s, v28.4s, v8.4s // ...................................................................................................................e..'....................................................................................................................~. + // str q13, [x0], #(16) // ....................................................................................................................~.'.....................................................................................................................* + // str q14, [x0, #(-16 + 1*(1024/8))] // ...................................................................................~..................................'....................................................................................*................................. + // str q15, [x0, #(-16 + 2*(1024/8))] // ............................................................................................~.........................'.............................................................................................*........................ + // str q16, [x0, #(-16 + 3*(1024/8))] // ........................................................................................~.............................'.........................................................................................*............................ + + sub count, count, #1 + cbnz count, layer123_start + // Instructions: 25 + // Expected cycles: 14 + // Expected IPC: 1.79 + // + // Wall time: 0.08s + // User time: 0.08s + // + // ----- original position -----> + // 0 25 + // |------------------------|---- + str q7, [x0, #896] // ..............*............... 
+ sqrdmulh v7.4S, v17.4S, v26.4S // *............................. + cmge v5.4S, v9.4S, v30.4S // ..*........................... + // gap // .............................. + str q13, [x0, #384] // ...................*.......... + cmge v11.4S, v27.4S, v30.4S // ...*.......................... + // gap // .............................. + // gap // .............................. + sub v17.4S, v22.4S, v5.4S // .......*...................... + cmge v22.4S, v31.4S, v24.4S // .....*........................ + // gap // .............................. + // gap // .............................. + mls v23.4S, v7.4S, v8.S[0] // ......*....................... + sub v7.4S, v28.4S, v11.4S // ........*..................... + // gap // .............................. + // gap // .............................. + mls v9.4S, v17.4S, v8.4S // ..........*................... + sub v17.4S, v22.4S, v14.4S // .........*.................... + // gap // .............................. + // gap // .............................. + mls v27.4S, v7.4S, v8.4S // .............*................ + cmge v28.4S, v10.4S, v30.4S // .*............................ + // gap // .............................. + // gap // .............................. + cmge v11.4S, v31.4S, v23.4S // ...........*.................. + cmge v21.4S, v23.4S, v30.4S // .....................*........ + // gap // .............................. + // gap // .............................. + sub v28.4S, v29.4S, v28.4S // ....*......................... + mls v24.4S, v17.4S, v8.4S // .................*............ + str q9, [x0, #256] // ....................*......... + // gap // .............................. + str q27, [x0, #640] // ...............*.............. + sub v16.4S, v11.4S, v21.4S // ......................*....... + // gap // .............................. + // gap // .............................. + mls v10.4S, v28.4S, v8.4S // ............*................. + // gap // .............................. 
+ // gap // .............................. + // gap // .............................. + mls v23.4S, v16.4S, v8.4S // .......................*...... + str q24, [x0, #128] // ..................*........... + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q10, [x0, #512] // ................*............. + // gap // .............................. + // gap // .............................. + // gap // .............................. + str q23, [x0], #(16) // ........................*..... + // gap // .............................. + // gap // .............................. + // gap // .............................. + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // sqrdmulh v5.4S, v17.4S, v26.4S // .*............................. + // cmge v18.4S, v10.4S, v30.4S // ............*.................. + // cmge v16.4S, v9.4S, v30.4S // ..*............................ + // cmge v17.4S, v27.4S, v30.4S // ....*.......................... + // sub v6.4S, v29.4S, v18.4S // ...............*............... + // cmge v21.4S, v31.4S, v24.4S // ......*........................ + // mls v23.4S, v5.4S, v8.S[0] // .......*....................... + // sub v16.4S, v22.4S, v16.4S // .....*......................... + // sub v18.4S, v28.4S, v17.4S // ........*...................... + // sub v22.4S, v21.4S, v14.4S // ..........*.................... + // mls v9.4S, v16.4S, v8.4S // .........*..................... + // cmge v16.4S, v31.4S, v23.4S // .............*................. + // mls v10.4S, v6.4S, v8.4S // ....................*.......... + // mls v27.4S, v18.4S, v8.4S // ...........*................... + // str q7, [x0, #896] // *.............................. + // str q27, [x0, #640] // ..................*............ 
+ // str q10, [x0, #512] // .......................*....... + // mls v24.4S, v22.4S, v8.4S // ................*.............. + // str q24, [x0, #128] // ......................*........ + // str q13, [x0, #384] // ...*........................... + // str q9, [x0, #256] // .................*............. + // cmge v21.4S, v23.4S, v30.4S // ..............*................ + // sub v18.4S, v16.4S, v21.4S // ...................*........... + // mls v23.4S, v18.4S, v8.4S // .....................*......... + // str q23, [x0], #(16) // ........................*...... + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s new file mode 100644 index 00000000..efd56dce --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a55.s @@ -0,0 +1,2233 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+// Named aliases for two general-purpose registers.
+xtmp0 .req x10
+xtmp1 .req x11
+
+// ---------------------------------------------------------------------------
+// Load/store wrappers.
+// \vec is a symbolic vector name; a matching `qform_<vec>` alias (the
+// 128-bit q-register view of that vector) must exist for the wrapper to
+// assemble.
+// ---------------------------------------------------------------------------
+
+// Load 16 bytes from [\base + \offset] into \vec (base unchanged).
+.macro ldr_vo vec, base, offset
+    ldr qform_\vec, [\base, #\offset]
+.endm
+
+// Load 16 bytes from [\base] into \vec, then advance \base by \inc.
+.macro ldr_vi vec, base, inc
+    ldr qform_\vec, [\base], #\inc
+.endm
+
+// Store \vec (16 bytes) to [\base + \offset] (base unchanged).
+.macro str_vo vec, base, offset
+    str qform_\vec, [\base, #\offset]
+.endm
+
+// Store \vec (16 bytes) to [\base], then advance \base by \inc.
+.macro str_vi vec, base, inc
+    str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Thin arithmetic wrappers on 4 x 32-bit lanes.
+// The `q`-suffixed variants take the second multiplicand from a single
+// lane \i of \b (by-element form); the others use the full vector \b.
+// ---------------------------------------------------------------------------
+
+// d = sqrdmulh(a, b), lane-wise.
+.macro vqrdmulh d,a,b
+    sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// d -= a * b, lane-wise.
+.macro vmls d,a,b
+    mls \d\().4s, \a\().4s, \b\().4s
+.endm
+
+// d = sqrdmulh(a, b[i]).
+.macro vqrdmulhq d,a,b,i
+    sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d = sqdmulh(a, b[i]).
+// The lane operand uses the `.s[\i]` element form, matching the other
+// by-element wrappers in this group.
+.macro vqdmulhq d,a,b,i
+    sqdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d = a * b[i] (low 32 bits of each product).
+.macro vmulq d,a,b,i
+    mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// d -= a * b[i].
+.macro vmlsq d,a,b,i
+    mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+// ---------------------------------------------------------------------------
+// Modular multiplication and reduction.
+// All of these read the register alias `consts`, which must be defined by
+// the including code before the macros are expanded.
+// ---------------------------------------------------------------------------
+
+// dst = src * const[idx0]; src = sqrdmulh(src, const[idx1]);
+// dst -= src * consts[0].
+// NOTE: \src is overwritten.
+// Presumably const[idx1] is the precomputed ("twisted") companion of
+// const[idx0] and consts[0] is the modulus -- confirm against the twiddle
+// table layout.
+.macro mulmodq dst, src, const, idx0, idx1
+    vmulq \dst, \src, \const, \idx0
+    vqrdmulhq \src, \src, \const, \idx1
+    vmlsq \dst, \src, consts, 0
+.endm
+
+// Same three-step sequence as mulmodq, but with full-vector multiplicands
+// \const and \const_twisted instead of single lanes.
+// NOTE: \src is overwritten.
+.macro mulmod dst, src, const, const_twisted
+    mul \dst\().4s, \src\().4s, \const\().4s
+    vqrdmulh \src, \src, \const_twisted
+    vmlsq \dst, \src, consts, 0
+.endm
+
+// tmp = rounding shift right of a by 23 bits; a -= tmp * consts (lane-wise).
+// Clobbers the register aliased as `tmp`.
+.macro barrett_reduce_single a
+    srshr tmp.4S, \a\().4S, #23
+    vmls \a, tmp, consts
+.endm
+
+// Conditional correction of \a by one multiple of consts per lane.
+// cmge writes all ones (-1) into a lane where the comparison holds, so
+// after the sub, \tmp2 is
+//   -1 where a <= neg_modulus_half  (a is then increased by consts),
+//   +1 where a >= modulus_half      (a is then decreased by consts),
+//    0 otherwise.
+// Clobbers \tmp1 and \tmp2.
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2
+    cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s
+    cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s
+    sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s
+    vmls \a, \tmp2, consts
+.endm
+
+// ---------------------------------------------------------------------------
+// Butterflies.
+// ---------------------------------------------------------------------------
+
+// tmp = a - b; a = a + b; b = mulmodq(tmp, root[idx0], root[idx1]).
+// Clobbers `tmp`.
+.macro gs_butterfly a, b, root, idx0, idx1
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// As gs_butterfly, but with full-vector \root / \root_twisted operands.
+// Clobbers `tmp`.
+.macro gs_butterfly_v a, b, root, root_twisted
+    sub tmp.4s, \a\().4s, \b\().4s
+    add \a\().4s, \a\().4s, \b\().4s
+    mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// dstN = mulmod(srcN, ninv, ninv_tw) for N = 0..3.
+// The aliases `ninv` and `ninv_tw` must be defined by the including code.
+// Each \srcN is overwritten (see mulmod).
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// ---------------------------------------------------------------------------
+// Bulk loads and stores.
+// ---------------------------------------------------------------------------
+
+// Load four consecutive 16-byte vectors starting at [\addr].
+.macro load_vectors a0, a1, a2, a3, addr
+    ldr_vo \a0, \addr, (16*0)
+    ldr_vo \a1, \addr, (16*1)
+    ldr_vo \a2, \addr, (16*2)
+    ldr_vo \a3, \addr, (16*3)
+.endm
+
+// Load four consecutive 16-byte vectors starting at [\addr + \offset].
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset
+    ldr_vo \a0, \addr, (16*0 + (\offset))
+    ldr_vo \a1, \addr, (16*1 + (\offset))
+    ldr_vo \a2, \addr, (16*2 + (\offset))
+    ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+// Store four consecutive 16-byte vectors and advance \addr by \inc.
+// The first store post-increments \addr, so the remaining three subtract
+// \inc again to address relative to the original pointer.
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc
+    str_vi \a0, \addr, \inc
+    str_vo \a1, \addr, (-(\inc) + 16*1)
+    str_vo \a2, \addr, (-(\inc) + 16*2)
+    str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+// Move both 64-bit halves of vectors <in>0 .. <in>3 into the eight
+// general-purpose registers <out>_00 .. <out>_31 (suffix = vector, half).
+.macro vec_to_scalar_matrix out, in
+    vext \out\()_00, \in\()0, 0
+    vext \out\()_01, \in\()0, 1
+    vext \out\()_10, \in\()1, 0
+    vext \out\()_11, \in\()1, 1
+    vext \out\()_20, \in\()2, 0
+    vext \out\()_21, \in\()2, 1
+    vext \out\()_30, \in\()3, 0
+    vext \out\()_31, \in\()3, 1
+.endm
+
+// Store the eight registers <x>t_00 .. <x>t_31 at an 8-byte stride and
+// advance \addr by \inc. As in store_vectors_with_inc, only the first
+// store post-increments; the rest compensate with -\inc.
+.macro store_scalar_matrix_with_inc x, addr, inc
+    str \x\()t_00, [\addr], #( \inc)
+    str \x\()t_01, [\addr, #(-\inc + 8*1)]
+    str \x\()t_10, [\addr, #(-\inc + 8*2)]
+    str \x\()t_11, [\addr, #(-\inc + 8*3)]
+    str \x\()t_20, [\addr, #(-\inc + 8*4)]
+    str \x\()t_21, [\addr, #(-\inc + 8*5)]
+    str \x\()t_30, [\addr, #(-\inc + 8*6)]
+    str \x\()t_31, [\addr, #(-\inc + 8*7)]
+.endm
+
+// gpr_out = 64-bit lane \lane of \vec_in.
+.macro vext gpr_out, vec_in, lane
+    umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+// ---------------------------------------------------------------------------
+// Twiddle-factor loads.
+// ---------------------------------------------------------------------------
+
+// Load root0..root3 (4 x 16 bytes) from r_ptr0 and advance r_ptr0 by 64.
+.macro load_roots_123
+    ldr_vi root0, r_ptr0, 64
+    ldr_vo root1, r_ptr0, (-64 + 16)
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Same load pattern as load_roots_123.
+.macro load_roots_456
+    ldr_vi root0, r_ptr0, 64
+    ldr_vo root1, r_ptr0, (-64 + 16)
+    ldr_vo root2, r_ptr0, (-64 + 32)
+    ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+// Load the first six of twelve 16-byte vectors from r_ptr1
+// (root0, root0_tw, root1, root1_tw, root2, root2_tw) and advance r_ptr1
+// past all twelve.
+.macro load_roots_78_part1
+    ldr_vi root0, r_ptr1, (12*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+// Load the remaining six vectors of the twelve-vector group into the same
+// registers. Offsets are relative to the already advanced r_ptr1, so this
+// must follow load_roots_78_part1; r_ptr1 itself is not modified here.
+.macro load_roots_78_part2
+    ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+    ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+    ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+    ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+    ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+    ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+// ---------------------------------------------------------------------------
+// Transposition.
+// ---------------------------------------------------------------------------
+
+// In-place 4x4 transpose of the 32-bit lanes held in data0..data3.
+// Clobbers t0..t3.
+.macro transpose4 data0, data1, data2, data3
+    // Stage 1: interleave 32-bit lanes of neighbouring rows.
+    trn1 t0.4s, \data0\().4s, \data1\().4s
+    trn2 t1.4s, \data0\().4s, \data1\().4s
+    trn1 t2.4s, \data2\().4s, \data3\().4s
+    trn2 t3.4s, \data2\().4s, \data3\().4s
+
+    // Stage 2: interleave 64-bit halves to complete the transpose.
+    trn2 \data2\().2d, t0.2d, t2.2d
+    trn2 \data3\().2d, t1.2d, t3.2d
+    trn1 \data0\().2d, t0.2d, t2.2d
+    trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+// Only the 32-bit interleave stage (stage 1 of transpose4), written to
+// separate output registers.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+    trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+    trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+    trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+    trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+// ---------------------------------------------------------------------------
+// Stack frame handling.
+// ---------------------------------------------------------------------------
+
+// Reserve 96 bytes and spill x19..x30 as six register pairs.
+.macro save_gprs // slothy:no-unfold
+    sub sp, sp, #(16*6)
+    stp x19, x20, [sp, #16*0]
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    stp x29, x30, [sp, #16*5]
+.endm
+
+// Reload x19..x30 and release the 96 bytes reserved by save_gprs.
+.macro restore_gprs // slothy:no-unfold
+    ldp x19, x20, [sp, #16*0]
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldp x29, x30, [sp, #16*5]
+    add sp, sp, #(16*6)
+.endm
+
+// Reserve 64 bytes and spill d8..d15 (the low 64 bits of v8..v15).
+.macro save_vregs // slothy:no-unfold
+    sub sp, sp, #(16*4)
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+// Reload d8..d15 and release the 64 bytes reserved by save_vregs.
+.macro restore_vregs // slothy:no-unfold
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)
+.endm
+
+// Size of the local scratch area below the saved registers, and the
+// offset of its only slot.
+#define STACK_SIZE 16
+#define STACK0 0
+
+// Reload register \a from stack slot \loc (byte offset from sp).
+.macro restore a, loc // slothy:no-unfold
+    ldr \a, [sp, #\loc\()]
+.endm
+
+// Spill register \a to stack slot \loc (byte offset from sp).
+// Note the operand order: slot first, register second.
+.macro save loc, a // slothy:no-unfold
+    str \a, [sp, #\loc\()]
+.endm
+
+// Function prologue: spill GPRs, then vector registers, then reserve the
+// STACK_SIZE-byte scratch area.
+.macro push_stack // slothy:no-unfold
+    save_gprs
+    save_vregs
+    sub sp, sp, #STACK_SIZE
+.endm
+
+// Function epilogue: exact reverse of push_stack.
+.macro pop_stack // slothy:no-unfold
+    add sp, sp, #STACK_SIZE
+    restore_vregs
+    restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+
+    .global intt_dilithium_123_45678_opt_a55
+    .global _intt_dilithium_123_45678_opt_a55
+
+.p2align 4
+// 8380417 = 2^23 - 2^13 + 1, padded with zero words to 16 bytes.
+const_addr: .word 8380417
+    .word 0
+    .word 0
+    .word 0
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_opt_a55:
+_intt_dilithium_123_45678_opt_a55:
+    push_stack
+
+    in .req x0
+    inp .req x1
+    inpp .req x2
+    count .req x3
+    r_ptr0 .req x4
+    r_ptr1 .req x5
+    xtmp .req x6
+
+    data0 .req v9
+    data1 .req v10
+    data2 .req v11
+    data3 .req v12
+    data4 .req v13
+    data5 .req v14
+    data6 .req v15
+    data7 .req v16
+
+    qform_data0 .req q9
+    qform_data1 .req q10
+    qform_data2 .req q11
+    qform_data3 .req q12
+    qform_data4 .req q13
+    qform_data5 .req q14
+    qform_data6 .req q15
+    qform_data7 .req q16
+
+    qform_v0 .req q0
+    qform_v1 .req q1
+    qform_v2 .req q2
+    qform_v3 .req q3
+    qform_v4 .req q4
+    qform_v5 .req q5
+    qform_v6 .req q6
+    qform_v7 .req q7
+    qform_v8 .req q8
+    qform_v9 .req q9
+    qform_v10 .req q10
+    qform_v11 .req q11
+    qform_v12 .req q12
+    qform_v13 .req q13
+    qform_v14 .req q14
+    qform_v15 .req q15
+    qform_v16 .req q16
+    qform_v17 .req q17
+    qform_v18 .req q18
+    qform_v19 .req q19
+    qform_v20 .req q20
+    qform_v21 .req q21
+    qform_v22 .req q22
+    qform_v23 .req q23
+    qform_v24 .req q24
+    qform_v25 .req q25
+    qform_v26 .req q26
+    qform_v27 .req q27
+    qform_v28 .req q28
+    qform_v29 .req q29
+    qform_v30 .req q30
+    qform_v31 .req q31
+
+    x_00 .req x10
+    x_01 .req x11
+    x_10 .req x12
+    x_11 .req x13
+    x_20 .req x14
+    x_21 .req x15
+    x_30 .req x16
+    x_31 .req x17
+
+    xt_00 .req x_00
+    xt_01 .req x_20
+    xt_10 .req x_10
+    xt_11 .req x_30
+    xt_20 .req x_01
+    xt_21 .req x_21
+    xt_30 .req x_11
+    xt_31 .req x_31
+
+    root0 .req v0
+    root1 .req v1
+    root2 .req v2
+    root3 .req v3
+
+
qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q0, [x5], #(12*16) // .*........................................................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ldr q17, [x5, #-128] // ......*...................................................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ ldr q31, [x5, #-160] // ..........*.................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // *............................................................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ldr q18, [x5, #-144] // ..............*.............................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + sub v6.4S, v21.4S, v22.4S // .........*................................................................................................................................... + // gap // ............................................................................................................................................. + sub v14.4S, v19.4S, v20.4S // .............*............................................................................................................................... + // gap // ............................................................................................................................................. + ldr q11, [x5, #-112] // ...............*............................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mul v17.4S, v6.4S, v17.4S // ............*................................................................................................................................ + // gap // ............................................................................................................................................. 
+ mul v31.4S, v14.4S, v31.4S // .................*........................................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v18.4S, v14.4S, v18.4S // ................*............................................................................................................................ + // gap // ............................................................................................................................................. + sqrdmulh v6.4S, v6.4S, v11.4S // ..................*.......................................................................................................................... + // gap // ............................................................................................................................................. + add v19.4S, v19.4S, v20.4S // ...*......................................................................................................................................... + // gap // ............................................................................................................................................. + add v14.4S, v21.4S, v22.4S // ..*.......................................................................................................................................... + // gap // ............................................................................................................................................. + mls v31.4S, v18.4S, v8.S[0] // ...................*......................................................................................................................... + // gap // ............................................................................................................................................. 
+ mls v17.4S, v6.4S, v8.S[0] // ....................*........................................................................................................................ + // gap // ............................................................................................................................................. + sub v18.4S, v19.4S, v14.4S // .....*....................................................................................................................................... + // gap // ............................................................................................................................................. + ldr q6, [x5, #-176] // ....*........................................................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + sub v11.4S, v31.4S, v17.4S // .....................*....................................................................................................................... + // gap // ............................................................................................................................................. + add v19.4S, v19.4S, v14.4S // .......................................................*..................................................................................... + // gap // ............................................................................................................................................. 
+ mul v14.4S, v18.4S, v0.4S // .......*..................................................................................................................................... + // gap // ............................................................................................................................................. + mul v0.4S, v11.4S, v0.4S // ...................................................*......................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v18.4S, v18.4S, v6.4S // ........*.................................................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v6.4S, v11.4S, v6.4S // .......................*..................................................................................................................... + // gap // ............................................................................................................................................. + add v17.4S, v31.4S, v17.4S // .........................................*................................................................................................... + // gap // ............................................................................................................................................. + ldr q31, [x5, #-64] // ......................*...................................................................................................................... + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mls v14.4S, v18.4S, v8.S[0] // ...........*................................................................................................................................. + // gap // ............................................................................................................................................. + mls v0.4S, v6.4S, v8.S[0] // ........................................................*.................................................................................... + // gap // ............................................................................................................................................. + trn1 v18.4S, v19.4S, v17.4S // ..........................................................*.................................................................................. + // gap // ............................................................................................................................................. + trn2 v17.4S, v19.4S, v17.4S // ...........................................................*................................................................................. + // gap // ............................................................................................................................................. + ldr q19, [x5, #-48] // ........................*.................................................................................................................... + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + trn1 v6.4S, v14.4S, v0.4S // ............................................................*................................................................................ + // gap // ............................................................................................................................................. + trn2 v0.4S, v14.4S, v0.4S // .............................................................*............................................................................... + // gap // ............................................................................................................................................. + ldr q14, [x5, #-32] // ..........................*.................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + trn2 v11.2D, v18.2D, v6.2D // ...............................................................*............................................................................. + // gap // ............................................................................................................................................. 
+ trn2 v28.2D, v17.2D, v0.2D // ................................................................*............................................................................ + // gap // ............................................................................................................................................. + trn1 v18.2D, v18.2D, v6.2D // ............................................................................*................................................................ + // gap // ............................................................................................................................................. + sub v6.4S, v11.4S, v28.4S // ..................................................................*.......................................................................... + // gap // ............................................................................................................................................. + trn1 v0.2D, v17.2D, v0.2D // .............................................................................*............................................................... + // gap // ............................................................................................................................................. + add v17.4S, v11.4S, v28.4S // ....................................................................................................*........................................ + // gap // ............................................................................................................................................. + sub v11.4S, v18.4S, v0.4S // ................................................................................*............................................................ + // gap // ............................................................................................................................................. 
+ add v0.4S, v18.4S, v0.4S // ...................................................................................................*......................................... + // gap // ............................................................................................................................................. + ldr q18, [x5, #-16] // ............................*................................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + add v28.4S, v0.4S, v17.4S // .......................................................................................................*..................................... + // gap // ............................................................................................................................................. + sub v0.4S, v0.4S, v17.4S // .......................................................................................................................*..................... + // gap // ............................................................................................................................................. + ldr q17, [x5, #-80] // .......................................*..................................................................................................... + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + srshr v26.4S, v28.4S, #23 // ..........................................................................................................*.................................. + // gap // ............................................................................................................................................. + ldr q29, [x5, #-96] // ............................................*................................................................................................ + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mls v28.4S, v26.4S, v8.4S // .............................................................................................................*............................... + // gap // ............................................................................................................................................. + ldr q26, [x4, #16] // .....................................................*....................................................................................... + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ldr q9, [x4, #32] // ......................................................*...................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + mul v4.4S, v11.4S, v26.S[2] // ...................................................................................*......................................................... + // gap // ............................................................................................................................................. + sqrdmulh v11.4S, v11.4S, v26.S[3] // ....................................................................................*........................................................ + // gap // ............................................................................................................................................. + mul v20.4S, v6.4S, v9.S[0] // .....................................................................*....................................................................... + // gap // ............................................................................................................................................. 
+ sqrdmulh v6.4S, v6.4S, v9.S[1] // .................................................................................*........................................................... + // gap // ............................................................................................................................................. + ldr q25, [x4, #48] // .......................................................................*..................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x2] // .........................*................................................................................................................... + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ mls v20.4S, v6.4S, v8.S[0] // .....................................................................................*....................................................... + // gap // ............................................................................................................................................. + mls v4.4S, v11.4S, v8.S[0] // ........................................................................................*.................................................... + // gap // ............................................................................................................................................. + sub v6.4S, v21.4S, v22.4S // .............................*............................................................................................................... + // gap // ............................................................................................................................................. + sub v11.4S, v23.4S, v24.4S // ...........................*................................................................................................................. + // gap // ............................................................................................................................................. + add v13.4S, v21.4S, v22.4S // ..................................*.......................................................................................................... + // gap // ............................................................................................................................................. + mul v31.4S, v6.4S, v31.4S // .................................*........................................................................................................... + // gap // ............................................................................................................................................. 
+ sqrdmulh v19.4S, v6.4S, v19.4S // ................................*............................................................................................................ + // gap // ............................................................................................................................................. + mul v6.4S, v11.4S, v14.4S // ..............................*.............................................................................................................. + // gap // ............................................................................................................................................. + sqrdmulh v18.4S, v11.4S, v18.4S // ...............................*............................................................................................................. + // gap // ............................................................................................................................................. + add v14.4S, v23.4S, v24.4S // ...................................*......................................................................................................... + // gap // ............................................................................................................................................. + mls v31.4S, v19.4S, v8.S[0] // .....................................*....................................................................................................... + // gap // ............................................................................................................................................. + add v27.4S, v4.4S, v20.4S // ............................................................................................*................................................ + // gap // ............................................................................................................................................. 
+ mls v6.4S, v18.4S, v8.S[0] // ....................................*........................................................................................................ + // gap // ............................................................................................................................................. + sub v19.4S, v13.4S, v14.4S // ......................................*...................................................................................................... + // gap // ............................................................................................................................................. + add v18.4S, v13.4S, v14.4S // .............................................*............................................................................................... + // gap // ............................................................................................................................................. + srshr v14.4S, v27.4S, #23 // ...............................................................................................*............................................. + // gap // ............................................................................................................................................. + sub v11.4S, v31.4S, v6.4S // ........................................*.................................................................................................... + // gap // ............................................................................................................................................. + sqrdmulh v13.4S, v19.4S, v17.4S // ..........................................*.................................................................................................. + // gap // ............................................................................................................................................. 
+ add v31.4S, v31.4S, v6.4S // ..............................................*.............................................................................................. + // gap // ............................................................................................................................................. + sqrdmulh v17.4S, v11.4S, v17.4S // ...........................................*................................................................................................. + // gap // ............................................................................................................................................. + mul v6.4S, v11.4S, v29.4S // ...............................................*............................................................................................. + // gap // ............................................................................................................................................. + mul v19.4S, v19.4S, v29.4S // ................................................*............................................................................................ + // gap // ............................................................................................................................................. + trn2 v11.4S, v18.4S, v31.4S // .................................................*........................................................................................... + // gap // ............................................................................................................................................. + trn1 v31.4S, v18.4S, v31.4S // ..................................................*.......................................................................................... + // gap // ............................................................................................................................................. 
+ mls v6.4S, v17.4S, v8.S[0] // .........................................................*................................................................................... + // gap // ............................................................................................................................................. + mls v19.4S, v13.4S, v8.S[0] // ....................................................*........................................................................................ + // gap // ............................................................................................................................................. + mls v27.4S, v14.4S, v8.4S // ..................................................................................................*.......................................... + // gap // ............................................................................................................................................. + sub v17.4S, v4.4S, v20.4S // ..............................................................................................................*.............................. + // gap // ............................................................................................................................................. + ldr q29, [x4], #64 // ...............................................................................................................*............................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. + // gap // ............................................................................................................................................. 
+ trn1 v18.4S, v19.4S, v6.4S // ..............................................................*.............................................................................. + // gap // ............................................................................................................................................. + trn2 v19.4S, v19.4S, v6.4S // .................................................................*........................................................................... + // gap // ............................................................................................................................................. + mul v6.4S, v17.4S, v29.S[2] // ..................................................................................................................*.......................... + // gap // ............................................................................................................................................. + trn2 v14.2D, v31.2D, v18.2D // ...................................................................*......................................................................... + // gap // ............................................................................................................................................. + trn2 v4.2D, v11.2D, v19.2D // ....................................................................*........................................................................ + // gap // ............................................................................................................................................. + trn1 v19.2D, v11.2D, v19.2D // ........................................................................*.................................................................... + // gap // ............................................................................................................................................. 
+ sub v11.4S, v14.4S, v4.4S // ......................................................................*...................................................................... + // gap // ............................................................................................................................................. + mul v20.4S, v0.4S, v29.S[2] // .............................................................................................................................*............... + // gap // ............................................................................................................................................. + sqrdmulh v0.4S, v0.4S, v29.S[3] // ..............................................................................................................................*.............. + // gap // ............................................................................................................................................. + trn1 v31.2D, v31.2D, v18.2D // .........................................................................*................................................................... + // gap // ............................................................................................................................................. + mul v18.4S, v11.4S, v25.S[0] // ..........................................................................*.................................................................. + // gap // ............................................................................................................................................. + sub v13.4S, v31.4S, v19.4S // ...........................................................................*................................................................. + // gap // ............................................................................................................................................. 
+ sqrdmulh v11.4S, v11.4S, v25.S[1] // ..................................................................................*.......................................................... + // gap // ............................................................................................................................................. + add v31.4S, v31.4S, v19.4S // .........................................................................................*................................................... + // gap // ............................................................................................................................................. + mul v19.4S, v13.4S, v9.S[2] // ..............................................................................*.............................................................. + // gap // ............................................................................................................................................. + sqrdmulh v9.4S, v13.4S, v9.S[3] // ...............................................................................*............................................................. + // gap // ............................................................................................................................................. + mls v18.4S, v11.4S, v8.S[0] // .......................................................................................*..................................................... + // gap // ............................................................................................................................................. + add v14.4S, v14.4S, v4.4S // ..........................................................................................*.................................................. + // gap // ............................................................................................................................................. 
+ sqrdmulh v17.4S, v17.4S, v29.S[3] // ....................................................................................................................*........................ + // gap // ............................................................................................................................................. + mls v19.4S, v9.4S, v8.S[0] // ......................................................................................*...................................................... + // gap // ............................................................................................................................................. + add v11.4S, v31.4S, v14.4S // ................................................................................................*............................................ + // gap // ............................................................................................................................................. + sub v31.4S, v31.4S, v14.4S // .............................................................................................*............................................... + // gap // ............................................................................................................................................. + mls v6.4S, v17.4S, v8.S[0] // ........................................................................................................................*.................... + // gap // ............................................................................................................................................. + srshr v17.4S, v11.4S, #23 // ........................................................................................................*.................................... + // gap // ............................................................................................................................................. 
+ sub v14.4S, v19.4S, v18.4S // .....................................................................................................*....................................... + // gap // ............................................................................................................................................. + sqrdmulh v9.4S, v31.4S, v26.S[1] // ...........................................................................................................................*................. + // gap // ............................................................................................................................................. + mls v11.4S, v17.4S, v8.4S // ............................................................................................................*................................ + // gap // ............................................................................................................................................. + mul v17.4S, v14.4S, v26.S[0] // ...........................................................................................................*................................. + // gap // ............................................................................................................................................. + sqrdmulh v14.4S, v14.4S, v26.S[1] // .................................................................................................................*........................... + // gap // ............................................................................................................................................. + mul v31.4S, v31.4S, v26.S[0] // .................................................................................................................................*........... + // gap // ............................................................................................................................................. 
+ add v26.4S, v28.4S, v11.4S // ................................................................................................................*............................ + // gap // ............................................................................................................................................. + sub v11.4S, v28.4S, v11.4S // ......................................................................................................................*...................... + // gap // ............................................................................................................................................. + add v4.4S, v19.4S, v18.4S // ...........................................................................................*................................................. + // gap // ............................................................................................................................................. + str q26, [x1], #(16*4) // ...................................................................................................................*......................... + // gap // ............................................................................................................................................. + mls v17.4S, v14.4S, v8.S[0] // .....................................................................................................................*....................... + // gap // ............................................................................................................................................. + srshr v19.4S, v4.4S, #23 // ..............................................................................................*.............................................. + // gap // ............................................................................................................................................. 
+ sqrdmulh v18.4S, v11.4S, v29.S[1] // .........................................................................................................................*................... + // gap // ............................................................................................................................................. + mul v14.4S, v11.4S, v29.S[0] // ..........................................................................................................................*.................. + // gap // ............................................................................................................................................. + sub v11.4S, v6.4S, v17.4S // ............................................................................................................................*................ + // gap // ............................................................................................................................................. + mls v4.4S, v19.4S, v8.4S // .................................................................................................*........................................... + // gap // ............................................................................................................................................. + add v17.4S, v6.4S, v17.4S // .......................................................................................................................................*..... + // gap // ............................................................................................................................................. + sqrdmulh v19.4S, v11.4S, v29.S[1] // ................................................................................................................................*............ + // gap // ............................................................................................................................................. 
+ mul v6.4S, v11.4S, v29.S[0] // ...............................................................................................................................*............. + // gap // ............................................................................................................................................. + mls v14.4S, v18.4S, v8.S[0] // ..................................................................................................................................*.......... + // gap // ............................................................................................................................................. + mls v20.4S, v0.4S, v8.S[0] // ...................................................................................................................................*......... + // gap // ............................................................................................................................................. + str q17, [x1, #-16] // ..........................................................................................................................................*.. + // gap // ............................................................................................................................................. + mls v6.4S, v19.4S, v8.S[0] // ....................................................................................................................................*........ + // gap // ............................................................................................................................................. + mls v31.4S, v9.4S, v8.S[0] // .....................................................................................................................................*....... + // gap // ............................................................................................................................................. 
+ str q14, [x2], #(16*4) // ......................................................................................................................................*...... + // gap // ............................................................................................................................................. + add v0.4S, v27.4S, v4.4S // ......................................................................................................*...................................... + // gap // ............................................................................................................................................. + str q6, [x2, #-16] // ........................................................................................................................................*.... + // gap // ............................................................................................................................................. + add v17.4S, v20.4S, v31.4S // .........................................................................................................................................*... + // gap // ............................................................................................................................................. + str q0, [x1, #-48] // .........................................................................................................*................................... + // gap // ............................................................................................................................................. + sub v10.4S, v20.4S, v31.4S // ...........................................................................................................................................*. + // gap // ............................................................................................................................................. 
+ str q17, [x1, #-32] // ............................................................................................................................................* + // gap // ............................................................................................................................................. + + // original source code + // ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // ...*......................................................................................................................................... + // ldr q2, [x5], #(12*16) // *............................................................................................................................................ + // add v0.4S, v18.4S, v19.4S // .............*............................................................................................................................... + // add v21.4S, v16.4S, v17.4S // ............*................................................................................................................................ + // ldr q27, [x5, #-176] // .................*........................................................................................................................... + // sub v24.4S, v21.4S, v0.4S // ................*............................................................................................................................ + // ldr q25, [x5, #-128] // .*........................................................................................................................................... + // mul v28.4S, v24.4S, v2.4S // ....................*........................................................................................................................ + // sqrdmulh v23.4S, v24.4S, v27.4S // ......................*...................................................................................................................... 
+ // sub v12.4S, v18.4S, v19.4S // .....*....................................................................................................................................... + // ldr q3, [x5, #-160] // ..*.......................................................................................................................................... + // mls v28.4S, v23.4S, v8.S[0] // ..........................*.................................................................................................................. + // mul v1.4S, v12.4S, v25.4S // ........*.................................................................................................................................... + // sub v13.4S, v16.4S, v17.4S // ......*...................................................................................................................................... + // ldr q22, [x5, #-144] // ....*........................................................................................................................................ + // ldr q5, [x5, #-112] // .......*..................................................................................................................................... + // sqrdmulh v20.4S, v13.4S, v22.4S // ..........*.................................................................................................................................. + // mul v11.4S, v13.4S, v3.4S // .........*................................................................................................................................... + // sqrdmulh v30.4S, v12.4S, v5.4S // ...........*................................................................................................................................. + // mls v11.4S, v20.4S, v8.S[0] // ..............*.............................................................................................................................. 
+ // mls v1.4S, v30.4S, v8.S[0] // ...............*............................................................................................................................. + // sub v31.4S, v11.4S, v1.4S // ..................*.......................................................................................................................... + // ldr q12, [x5, #-64] // .........................*................................................................................................................... + // sqrdmulh v24.4S, v31.4S, v27.4S // .......................*..................................................................................................................... + // ldr q4, [x5, #-48] // ..............................*.............................................................................................................. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ........................................................*.................................................................................... + // ldr q17, [x5, #-32] // .................................*........................................................................................................... + // sub v20.4S, v15.4S, v16.4S // ............................................................*................................................................................ + // ldr q5, [x5, #-16] // ..........................................*.................................................................................................. + // sub v30.4S, v13.4S, v14.4S // ...........................................................*................................................................................. + // mul v3.4S, v20.4S, v17.4S // ................................................................*............................................................................ 
+ // sqrdmulh v27.4S, v20.4S, v5.4S // .................................................................*........................................................................... + // sqrdmulh v29.4S, v30.4S, v4.4S // ...............................................................*............................................................................. + // mul v12.4S, v30.4S, v12.4S // ..............................................................*.............................................................................. + // add v14.4S, v13.4S, v14.4S // .............................................................*............................................................................... + // add v6.4S, v15.4S, v16.4S // ..................................................................*.......................................................................... + // mls v3.4S, v27.4S, v8.S[0] // .....................................................................*....................................................................... + // mls v12.4S, v29.4S, v8.S[0] // ...................................................................*......................................................................... + // sub v20.4S, v14.4S, v6.4S // ......................................................................*...................................................................... + // ldr q26, [x5, #-80] // .............................................*............................................................................................... + // sub v13.4S, v12.4S, v3.4S // .........................................................................*................................................................... + // add v22.4S, v11.4S, v1.4S // ........................*.................................................................................................................... 
+ // sqrdmulh v4.4S, v20.4S, v26.4S // ..........................................................................*.................................................................. + // sqrdmulh v7.4S, v13.4S, v26.4S // ............................................................................*................................................................ + // ldr q19, [x5, #-96] // ...............................................*............................................................................................. + // add v25.4S, v14.4S, v6.4S // .......................................................................*..................................................................... + // add v1.4S, v12.4S, v3.4S // ...........................................................................*................................................................. + // mul v5.4S, v13.4S, v19.4S // .............................................................................*............................................................... + // mul v29.4S, v20.4S, v19.4S // ..............................................................................*.............................................................. + // trn2 v18.4S, v25.4S, v1.4S // ...............................................................................*............................................................. + // trn1 v12.4S, v25.4S, v1.4S // ................................................................................*............................................................ + // mul v30.4S, v31.4S, v2.4S // .....................*....................................................................................................................... + // mls v29.4S, v4.4S, v8.S[0] // ..................................................................................*.......................................................... 
+ // ldr q10, [x4, #16] // .................................................*........................................................................................... + // ldr q15, [x4, #32] // ..................................................*.......................................................................................... + // add v27.4S, v21.4S, v0.4S // ...................*......................................................................................................................... + // mls v30.4S, v24.4S, v8.S[0] // ...........................*................................................................................................................. + // mls v5.4S, v7.4S, v8.S[0] // .................................................................................*........................................................... + // trn1 v17.4S, v27.4S, v22.4S // ............................*................................................................................................................ + // trn2 v2.4S, v27.4S, v22.4S // .............................*............................................................................................................... + // trn1 v11.4S, v28.4S, v30.4S // ...............................*............................................................................................................. + // trn2 v26.4S, v28.4S, v30.4S // ................................*............................................................................................................ + // trn1 v4.4S, v29.4S, v5.4S // ......................................................................................*...................................................... + // trn2 v7.2D, v17.2D, v11.2D // ..................................*.......................................................................................................... 
+ // trn2 v3.2D, v2.2D, v26.2D // ...................................*......................................................................................................... + // trn2 v0.4S, v29.4S, v5.4S // .......................................................................................*..................................................... + // sub v20.4S, v7.4S, v3.4S // .....................................*....................................................................................................... + // trn2 v14.2D, v12.2D, v4.2D // .........................................................................................*................................................... + // trn2 v27.2D, v18.2D, v0.2D // ..........................................................................................*.................................................. + // mul v31.4S, v20.4S, v15.S[0] // .....................................................*....................................................................................... + // sub v1.4S, v14.4S, v27.4S // ............................................................................................*................................................ + // ldr q23, [x4, #48] // .......................................................*..................................................................................... + // trn1 v19.2D, v18.2D, v0.2D // ...........................................................................................*................................................. + // trn1 v25.2D, v12.2D, v4.2D // ...............................................................................................*............................................. + // mul v29.4S, v1.4S, v23.S[0] // ................................................................................................*............................................ 
+ // sub v5.4S, v25.4S, v19.4S // .................................................................................................*........................................... + // trn1 v6.2D, v17.2D, v11.2D // ....................................*........................................................................................................ + // trn1 v9.2D, v2.2D, v26.2D // ......................................*...................................................................................................... + // mul v24.4S, v5.4S, v15.S[2] // ....................................................................................................*........................................ + // sqrdmulh v11.4S, v5.4S, v15.S[3] // .....................................................................................................*....................................... + // sub v16.4S, v6.4S, v9.4S // ........................................*.................................................................................................... + // sqrdmulh v13.4S, v20.4S, v15.S[1] // ......................................................*...................................................................................... + // sqrdmulh v30.4S, v1.4S, v23.S[1] // ..................................................................................................*.......................................... + // mul v22.4S, v16.4S, v10.S[2] // ...................................................*......................................................................................... + // sqrdmulh v17.4S, v16.4S, v10.S[3] // ....................................................*........................................................................................ + // mls v31.4S, v13.4S, v8.S[0] // .........................................................*................................................................................... 
+ // mls v24.4S, v11.4S, v8.S[0] // .........................................................................................................*................................... + // mls v29.4S, v30.4S, v8.S[0] // ......................................................................................................*...................................... + // mls v22.4S, v17.4S, v8.S[0] // ..........................................................*.................................................................................. + // add v17.4S, v25.4S, v19.4S // ...................................................................................................*......................................... + // add v18.4S, v14.4S, v27.4S // .......................................................................................................*..................................... + // add v4.4S, v24.4S, v29.4S // ......................................................................................................................*...................... + // add v27.4S, v22.4S, v31.4S // ....................................................................*........................................................................ + // sub v5.4S, v17.4S, v18.4S // ...........................................................................................................*................................. + // srshr v25.4S, v4.4S, #23 // .........................................................................................................................*................... + // srshr v26.4S, v27.4S, #23 // ........................................................................*.................................................................... + // add v21.4S, v17.4S, v18.4S // ..........................................................................................................*.................................. 
+ // mls v4.4S, v25.4S, v8.4S // .............................................................................................................................*............... + // mls v27.4S, v26.4S, v8.4S // ...................................................................................*......................................................... + // add v28.4S, v6.4S, v9.4S // .........................................*................................................................................................... + // add v23.4S, v7.4S, v3.4S // .......................................*..................................................................................................... + // sub v13.4S, v24.4S, v29.4S // ..............................................................................................................*.............................. + // add v18.4S, v27.4S, v4.4S // .......................................................................................................................................*..... + // add v2.4S, v28.4S, v23.4S // ...........................................*................................................................................................. + // srshr v26.4S, v21.4S, #23 // .............................................................................................................*............................... + // str q18, [x1, #16] // ..........................................................................................................................................*.. + // srshr v12.4S, v2.4S, #23 // ..............................................*.............................................................................................. + // mul v19.4S, v13.4S, v10.S[0] // .................................................................................................................*........................... 
+ // mls v21.4S, v26.4S, v8.4S // ................................................................................................................*............................ + // mls v2.4S, v12.4S, v8.4S // ................................................*............................................................................................ + // sub v6.4S, v22.4S, v31.4S // ....................................................................................*........................................................ + // ldr q29, [x4], #64 // .....................................................................................*....................................................... + // add v14.4S, v2.4S, v21.4S // ....................................................................................................................*........................ + // sqrdmulh v11.4S, v13.4S, v10.S[1] // ..................................................................................................................*.......................... + // mul v9.4S, v6.4S, v29.S[2] // ........................................................................................*.................................................... + // str q14, [x1], #(16*4) // .......................................................................................................................*..................... + // sqrdmulh v1.4S, v6.4S, v29.S[3] // ........................................................................................................*.................................... + // mls v19.4S, v11.4S, v8.S[0] // ........................................................................................................................*.................... + // sub v7.4S, v2.4S, v21.4S // .....................................................................................................................*....................... 
+ // sub v14.4S, v28.4S, v23.4S // ............................................*................................................................................................ + // mls v9.4S, v1.4S, v8.S[0] // ............................................................................................................*................................ + // sqrdmulh v1.4S, v7.4S, v29.S[1] // ..........................................................................................................................*.................. + // mul v23.4S, v7.4S, v29.S[0] // ...........................................................................................................................*................. + // sqrdmulh v0.4S, v5.4S, v10.S[1] // ...............................................................................................................*............................. + // sub v15.4S, v9.4S, v19.4S // ............................................................................................................................*................ + // mul v12.4S, v14.4S, v29.S[2] // .............................................................................................*............................................... + // sqrdmulh v17.4S, v14.4S, v29.S[3] // ..............................................................................................*.............................................. + // mul v30.4S, v15.4S, v29.S[0] // ................................................................................................................................*............ + // sqrdmulh v31.4S, v15.4S, v29.S[1] // ...............................................................................................................................*............. + // mul v16.4S, v5.4S, v10.S[0] // ...................................................................................................................*......................... 
+ // mls v23.4S, v1.4S, v8.S[0] // .................................................................................................................................*........... + // mls v12.4S, v17.4S, v8.S[0] // ..................................................................................................................................*.......... + // mls v30.4S, v31.4S, v8.S[0] // ....................................................................................................................................*........ + // mls v16.4S, v0.4S, v8.S[0] // .....................................................................................................................................*....... + // str q23, [x2], #(16*4) // ......................................................................................................................................*...... + // add v24.4S, v9.4S, v19.4S // ..............................................................................................................................*.............. + // str q30, [x2, #-16] // ........................................................................................................................................*.... + // add v22.4S, v12.4S, v16.4S // .........................................................................................................................................*... + // str q24, [x1, #-16] // ...................................................................................................................................*......... + // sub v10.4S, v12.4S, v16.4S // ...........................................................................................................................................*. 
+ // str q22, [x1, #-32] // ............................................................................................................................................* + + sub count, count, #1 +layer45678_start: + sub v6.4S, v27.4S, v4.4S // ...............................................................................................................................*........................ + add x1, x1, #64 // ......................................................................................................................................................*. + ld4 {v16.4S, v17.4S, v18.4S, v19.4S}, [x1] // e....................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q2, [x5], #(12*16) // ..e..................................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v0.4S, v18.4S, v19.4S // ..............e......................................................................................................................................... + // gap // ........................................................................................................................................................ + add v21.4S, v16.4S, v17.4S // .........e.............................................................................................................................................. + // gap // ........................................................................................................................................................ + ldr q27, [x5, #-176] // ...e.................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v24.4S, v21.4S, v0.4S // ..................e..................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + ldr q25, [x5, #-128] // ......e................................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v28.4S, v24.4S, v2.4S // ....................e................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v23.4S, v24.4S, v27.4S // .....................e.................................................................................................................................. + // gap // ........................................................................................................................................................ + sub v12.4S, v18.4S, v19.4S // .............e.......................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ ldr q3, [x5, #-160] // ....e................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v28.4S, v23.4S, v8.S[0] // ......................e................................................................................................................................. + // gap // ........................................................................................................................................................ + mul v1.4S, v12.4S, v25.4S // ...............e........................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v13.4S, v16.4S, v17.4S // ........e............................................................................................................................................... + // gap // ........................................................................................................................................................ + ldr q22, [x5, #-144] // .....e.................................................................................................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q5, [x5, #-112] // .......e................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v20.4S, v13.4S, v22.4S // ...........e............................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v11.4S, v13.4S, v3.4S // ..........e............................................................................................................................................. + // gap // ........................................................................................................................................................ 
+ sqrdmulh v30.4S, v12.4S, v5.4S // ................e....................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v12.4S, v6.4S, v29.S[1] // ..................................................................................................................................*..................... + // gap // ........................................................................................................................................................ + mul v6.4S, v6.4S, v29.S[0] // .................................................................................................................................*...................... + // gap // ........................................................................................................................................................ + mls v11.4S, v20.4S, v8.S[0] // ............e........................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v1.4S, v30.4S, v8.S[0] // .................e...................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v9.4S, v10.4S, v29.S[1] // .......................................................................................................................................*................ 
+ // gap // ........................................................................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // ...................................................................................................................................*.................... + // gap // ........................................................................................................................................................ + mul v13.4S, v10.4S, v29.S[0] // ......................................................................................................................................*................. + // gap // ........................................................................................................................................................ + sub v31.4S, v11.4S, v1.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q12, [x5, #-64] // ..............................e......................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sqrdmulh v24.4S, v31.4S, v27.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ + mls v13.4S, v9.4S, v8.S[0] // ........................................................................................................................................*............... + // gap // ........................................................................................................................................................ + str q6, [x2, #-48] // ...................................................................................................................................................*.... + // gap // ........................................................................................................................................................ + ldr q4, [x5, #-48] // ...............................e........................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q13, [x2, #-32] // ....................................................................................................................................................*... 
+ add x2, x2, #64 // .......................................................................................................................................................* + ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .e...................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q17, [x5, #-32] // ................................e....................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v20.4S, v15.4S, v16.4S // .......................................e................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q5, [x5, #-16] // .................................e...................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v30.4S, v13.4S, v14.4S // ..................................e..................................................................................................................... + // gap // ........................................................................................................................................................ + mul v3.4S, v20.4S, v17.4S // .........................................e.............................................................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v27.4S, v20.4S, v5.4S // ..........................................e............................................................................................................. 
+ // gap // ........................................................................................................................................................ + sqrdmulh v29.4S, v30.4S, v4.4S // .....................................e.................................................................................................................. + // gap // ........................................................................................................................................................ + mul v12.4S, v30.4S, v12.4S // ....................................e................................................................................................................... + // gap // ........................................................................................................................................................ + add v14.4S, v13.4S, v14.4S // ...................................e.................................................................................................................... + // gap // ........................................................................................................................................................ + add v6.4S, v15.4S, v16.4S // ........................................e............................................................................................................... + // gap // ........................................................................................................................................................ + mls v3.4S, v27.4S, v8.S[0] // ...........................................e............................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v12.4S, v29.4S, v8.S[0] // ......................................e................................................................................................................. + // gap // ........................................................................................................................................................ + sub v20.4S, v14.4S, v6.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + ldr q26, [x5, #-80] // .............................e.......................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v13.4S, v12.4S, v3.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ + add v22.4S, v11.4S, v1.4S // ........................e............................................................................................................................... 
+ // gap // ........................................................................................................................................................ + sqrdmulh v4.4S, v20.4S, v26.4S // ...............................................e........................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v7.4S, v13.4S, v26.4S // ....................................................e................................................................................................... + // gap // ........................................................................................................................................................ + ldr q19, [x5, #-96] // ............................e........................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v25.4S, v14.4S, v6.4S // .............................................e.......................................................................................................... + // gap // ........................................................................................................................................................ 
+ add v1.4S, v12.4S, v3.4S // ..................................................e..................................................................................................... + // gap // ........................................................................................................................................................ + mul v5.4S, v13.4S, v19.4S // ...................................................e.................................................................................................... + // gap // ........................................................................................................................................................ + mul v29.4S, v20.4S, v19.4S // ..............................................e......................................................................................................... + // gap // ........................................................................................................................................................ + trn2 v18.4S, v25.4S, v1.4S // ...............................................................e........................................................................................ + // gap // ........................................................................................................................................................ + trn1 v12.4S, v25.4S, v1.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ + mul v30.4S, v31.4S, v2.4S // .........................e.............................................................................................................................. 
+ // gap // ........................................................................................................................................................ + mls v29.4S, v4.4S, v8.S[0] // ................................................e....................................................................................................... + // gap // ........................................................................................................................................................ + ldr q10, [x4, #16] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q15, [x4, #32] // ........................................................................e............................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ add v27.4S, v21.4S, v0.4S // ...................e.................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v30.4S, v24.4S, v8.S[0] // ...........................e............................................................................................................................ + // gap // ........................................................................................................................................................ + mls v5.4S, v7.4S, v8.S[0] // .....................................................e.................................................................................................. + // gap // ........................................................................................................................................................ + trn1 v17.4S, v27.4S, v22.4S // ......................................................e................................................................................................. + // gap // ........................................................................................................................................................ + trn2 v2.4S, v27.4S, v22.4S // .......................................................e................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v11.4S, v28.4S, v30.4S // ........................................................e............................................................................................... 
+ // gap // ........................................................................................................................................................ + trn2 v26.4S, v28.4S, v30.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + trn1 v4.4S, v29.4S, v5.4S // ................................................................e....................................................................................... + // gap // ........................................................................................................................................................ + trn2 v7.2D, v17.2D, v11.2D // ..........................................................e............................................................................................. + // gap // ........................................................................................................................................................ + trn2 v3.2D, v2.2D, v26.2D // ...........................................................e............................................................................................ + // gap // ........................................................................................................................................................ + trn2 v0.4S, v29.4S, v5.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ 
+ sub v20.4S, v7.4S, v3.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + trn2 v14.2D, v12.2D, v4.2D // ..................................................................e..................................................................................... + // gap // ........................................................................................................................................................ + trn2 v27.2D, v18.2D, v0.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + mul v31.4S, v20.4S, v15.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ + sub v1.4S, v14.4S, v27.4S // .........................................................................................e.............................................................. + // gap // ........................................................................................................................................................ + ldr q23, [x4, #48] // .........................................................................e.............................................................................. 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v19.2D, v18.2D, v0.2D // .....................................................................e.................................................................................. + // gap // ........................................................................................................................................................ + trn1 v25.2D, v12.2D, v4.2D // ....................................................................e................................................................................... + // gap // ........................................................................................................................................................ + mul v29.4S, v1.4S, v23.S[0] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + sub v5.4S, v25.4S, v19.4S // ....................................................................................e................................................................... + // gap // ........................................................................................................................................................ 
+ trn1 v6.2D, v17.2D, v11.2D // ............................................................e........................................................................................... + // gap // ........................................................................................................................................................ + trn1 v9.2D, v2.2D, v26.2D // .............................................................e.......................................................................................... + // gap // ........................................................................................................................................................ + mul v24.4S, v5.4S, v15.S[2] // ......................................................................................e................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v5.4S, v15.S[3] // .......................................................................................e................................................................ + // gap // ........................................................................................................................................................ + sub v16.4S, v6.4S, v9.4S // ..........................................................................e............................................................................. + // gap // ........................................................................................................................................................ + sqrdmulh v13.4S, v20.4S, v15.S[1] // ..................................................................................e..................................................................... 
+ // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v1.4S, v23.S[1] // ............................................................................................e........................................................... + // gap // ........................................................................................................................................................ + mul v22.4S, v16.4S, v10.S[2] // ............................................................................e........................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v17.4S, v16.4S, v10.S[3] // .............................................................................e.......................................................................... + // gap // ........................................................................................................................................................ + mls v31.4S, v13.4S, v8.S[0] // ...................................................................................e.................................................................... + // gap // ........................................................................................................................................................ + mls v24.4S, v11.4S, v8.S[0] // ........................................................................................e............................................................... + // gap // ........................................................................................................................................................ 
+ mls v29.4S, v30.4S, v8.S[0] // .............................................................................................e.......................................................... + // gap // ........................................................................................................................................................ + mls v22.4S, v17.4S, v8.S[0] // ..............................................................................e......................................................................... + // gap // ........................................................................................................................................................ + add v17.4S, v25.4S, v19.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + add v18.4S, v14.4S, v27.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ + add v4.4S, v24.4S, v29.4S // ..............................................................................................................e......................................... + // gap // ........................................................................................................................................................ + add v27.4S, v22.4S, v31.4S // ....................................................................................................e................................................... 
+ // gap // ........................................................................................................................................................ + sub v5.4S, v17.4S, v18.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ + srshr v25.4S, v4.4S, #23 // ........................................................................................................................e............................... + // gap // ........................................................................................................................................................ + srshr v26.4S, v27.4S, #23 // ....................................................................................................................e................................... + // gap // ........................................................................................................................................................ + add v21.4S, v17.4S, v18.4S // .........................................................................................................e.............................................. + // gap // ........................................................................................................................................................ + mls v4.4S, v25.4S, v8.4S // .........................................................................................................................e.............................. + // gap // ........................................................................................................................................................ 
+ mls v27.4S, v26.4S, v8.4S // .....................................................................................................................e.................................. + // gap // ........................................................................................................................................................ + add v28.4S, v6.4S, v9.4S // ...........................................................................e............................................................................ + // gap // ........................................................................................................................................................ + add v23.4S, v7.4S, v3.4S // ................................................................................e....................................................................... + // gap // ........................................................................................................................................................ + sub v13.4S, v24.4S, v29.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ + add v18.4S, v27.4S, v4.4S // ................................................................................................................................e....................... + // gap // ........................................................................................................................................................ + add v2.4S, v28.4S, v23.4S // ...............................................................................................e........................................................ 
+ // gap // ........................................................................................................................................................ + srshr v26.4S, v21.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ + str q18, [x1, #16] // ...............................................................................................................................................e........ + // gap // ........................................................................................................................................................ + srshr v12.4S, v2.4S, #23 // ..................................................................................................................e..................................... + // gap // ........................................................................................................................................................ + mul v19.4S, v13.4S, v10.S[0] // ...............................................................................................................e........................................ + // gap // ........................................................................................................................................................ + mls v21.4S, v26.4S, v8.4S // .......................................................................................................................e................................ + // gap // ........................................................................................................................................................ 
+ mls v2.4S, v12.4S, v8.4S // ...................................................................................................................e.................................... + // gap // ........................................................................................................................................................ + sub v6.4S, v22.4S, v31.4S // ...................................................................................................e.................................................... + // gap // ........................................................................................................................................................ + ldr q29, [x4], #64 // ......................................................................e................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v14.4S, v2.4S, v21.4S // ...........................................................................................................................e............................ + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v13.4S, v10.S[1] // ................................................................................................................e....................................... 
+ // gap // ........................................................................................................................................................ + mul v9.4S, v6.4S, v29.S[2] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ + str q14, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v6.4S, v29.S[3] // ......................................................................................................e................................................. + // gap // ........................................................................................................................................................ + mls v19.4S, v11.4S, v8.S[0] // .................................................................................................................e...................................... + // gap // ........................................................................................................................................................ + sub v7.4S, v2.4S, v21.4S // ..........................................................................................................................e............................. + // gap // ........................................................................................................................................................ 
+ sub v14.4S, v28.4S, v23.4S // ..............................................................................................e......................................................... + // gap // ........................................................................................................................................................ + mls v9.4S, v1.4S, v8.S[0] // .......................................................................................................e................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v7.4S, v29.S[1] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + mul v23.4S, v7.4S, v29.S[0] // ............................................................................................................................e........................... + // gap // ........................................................................................................................................................ + sqrdmulh v0.4S, v5.4S, v10.S[1] // ...........................................................................................................e............................................ + // gap // ........................................................................................................................................................ + sub v15.4S, v9.4S, v19.4S // .........................................................................................................................................e.............. 
+ // gap // ........................................................................................................................................................ + mul v12.4S, v14.4S, v29.S[2] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v17.4S, v14.4S, v29.S[3] // .................................................................................................e...................................................... + // gap // ........................................................................................................................................................ + mul v30.4S, v15.4S, v29.S[0] // ...........................................................................................................................................e............ + // gap // ........................................................................................................................................................ + sqrdmulh v31.4S, v15.4S, v29.S[1] // ............................................................................................................................................e........... + // gap // ........................................................................................................................................................ + mul v16.4S, v5.4S, v10.S[0] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ 
+ mls v23.4S, v1.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + mls v12.4S, v17.4S, v8.S[0] // ..................................................................................................e..................................................... + // gap // ........................................................................................................................................................ + mls v30.4S, v31.4S, v8.S[0] // .............................................................................................................................................e.......... + // gap // ........................................................................................................................................................ + mls v16.4S, v0.4S, v8.S[0] // ............................................................................................................e........................................... + // gap // ........................................................................................................................................................ + str q23, [x2], #(16*4) // ..................................................................................................................................................e..... + // gap // ........................................................................................................................................................ + add v24.4S, v9.4S, v19.4S // ..........................................................................................................................................e............. 
+ // gap // ........................................................................................................................................................ + str q30, [x2, #-16] // .....................................................................................................................................................e.. + // gap // ........................................................................................................................................................ + add v22.4S, v12.4S, v16.4S // .....................................................................................................................................e.................. + // gap // ........................................................................................................................................................ + str q24, [x1, #-16] // .................................................................................................................................................e...... + // gap // ........................................................................................................................................................ + sub v10.4S, v12.4S, v16.4S // ....................................................................................................................................e................... + // gap // ........................................................................................................................................................ + str q22, [x1, #-32] // ................................................................................................................................................e....... + // gap // ........................................................................................................................................................ 
+ + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // e.....................................................................................................................................................|.e................................. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..................................e...................................................................................................................|................................... + // ldr q0, [x5], #(12*16) // .e....................................................................................................................................................|..e................................ + // ldr q4, [x5, #(-12*16 + 1*16)] // ....e.................................................................................................................................................|.....e............................. + // ldr q1, [x5, #(-12*16 + 2*16)] // ..........e...........................................................................................................................................|...........e....................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ..............e.......................................................................................................................................|...............e................... + // ldr q2, [x5, #(-12*16 + 4*16)] // ......e...............................................................................................................................................|.......e........................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ...............e......................................................................................................................................|................e.................. 
+ // sub v24.4s, v9.4s, v10.4s // .............e........................................................................................................................................|..............e.................... + // add v9.4s, v9.4s, v10.4s // ...e..................................................................................................................................................|....e.............................. + // mul v10.4s, v24.4s, v1.4s // .................e....................................................................................................................................|..................e................ + // sqrdmulh v24.4s, v24.4s, v5.4s // ................e.....................................................................................................................................|.................e................. + // mls v10.4s, v24.4s, v8.s[0] // .....................e................................................................................................................................|......................e............ + // sub v24.4s, v11.4s, v12.4s // .........e............................................................................................................................................|..........e........................ + // add v11.4s, v11.4s, v12.4s // ..e...................................................................................................................................................|...e............................... + // mul v12.4s, v24.4s, v2.4s // ............e.........................................................................................................................................|.............e..................... 
+ // sqrdmulh v24.4s, v24.4s, v6.4s // ..................e...................................................................................................................................|...................e............... + // mls v12.4s, v24.4s, v8.s[0] // ......................e...............................................................................................................................|.......................e........... + // sub v24.4s, v9.4s, v11.4s // .....e................................................................................................................................................|......e............................ + // add v9.4s, v9.4s, v11.4s // ................................................................e.....................................................................................|................................... + // mul v11.4s, v24.4s, v0.4s // .......e..............................................................................................................................................|........e.......................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ........e.............................................................................................................................................|.........e......................... + // mls v11.4s, v24.4s, v8.s[0] // ...........e..........................................................................................................................................|............e...................... + // sub v24.4s, v10.4s, v12.4s // ..........................e...........................................................................................................................|...........................e....... 
+ // add v10.4s, v10.4s, v12.4s // ..................................................e...................................................................................................|................................... + // mul v12.4s, v24.4s, v0.4s // ............................................................e.........................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ............................e.........................................................................................................................|.............................e..... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................e....................................................................................|................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // .....................................................e................................................................................................|................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ................................................e.....................................................................................................|................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...........................e..........................................................................................................................|............................e...... + // ldr q5, [x5, #(-12*16 + 9*16)] // ...............................e......................................................................................................................|................................e.. 
+ // ldr q2, [x5, #(-12*16 + 10*16)] // ...................................e..................................................................................................................|................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .....................................e................................................................................................................|................................... + // sub v24.4s, v13.4s, v14.4s // ......................................e...............................................................................................................|................................... + // add v13.4s, v13.4s, v14.4s // ...........................................e..........................................................................................................|................................... + // mul v14.4s, v24.4s, v1.4s // ..........................................e...........................................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................e............................................................................................................|................................... + // mls v14.4s, v24.4s, v8.s[0] // ..............................................e.......................................................................................................|................................... + // sub v24.4s, v15.4s, v16.4s // ....................................e.................................................................................................................|................................... 
+ // add v15.4s, v15.4s, v16.4s // ............................................e.........................................................................................................|................................... + // mul v16.4s, v24.4s, v2.4s // .......................................e..............................................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ........................................e.............................................................................................................|................................... + // mls v16.4s, v24.4s, v8.s[0] // .............................................e........................................................................................................|................................... + // sub v24.4s, v13.4s, v15.4s // ...............................................e......................................................................................................|................................... + // add v13.4s, v13.4s, v15.4s // ......................................................e...............................................................................................|................................... + // mul v15.4s, v24.4s, v0.4s // .........................................................e............................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e..................................................................................................|................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // .............................................................e........................................................................................|................................... + // sub v24.4s, v14.4s, v16.4s // .................................................e....................................................................................................|................................... + // add v14.4s, v14.4s, v16.4s // .......................................................e..............................................................................................|................................... + // mul v16.4s, v24.4s, v0.4s // ........................................................e.............................................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................e.................................................................................................|................................... + // mls v16.4s, v24.4s, v8.s[0] // ..................................................................e...................................................................................|................................... + // trn1 v25.4s, v9.4s, v10.4s // ...................................................................e..................................................................................|................................... + // trn2 v26.4s, v9.4s, v10.4s // ....................................................................e.................................................................................|................................... 
+ // trn1 v27.4s, v11.4s, v12.4s // .....................................................................e................................................................................|................................... + // trn2 v28.4s, v11.4s, v12.4s // ......................................................................e...............................................................................|................................... + // trn2 v11.2d, v25.2d, v27.2d // ........................................................................e.............................................................................|................................... + // trn2 v12.2d, v26.2d, v28.2d // .........................................................................e............................................................................|................................... + // trn1 v9.2d, v25.2d, v27.2d // .....................................................................................e................................................................|................................... + // trn1 v10.2d, v26.2d, v28.2d // ......................................................................................e...............................................................|................................... + // trn1 v25.4s, v13.4s, v14.4s // ...........................................................e..........................................................................................|................................... + // trn2 v26.4s, v13.4s, v14.4s // ..........................................................e...........................................................................................|................................... 
+ // trn1 v27.4s, v15.4s, v16.4s // .......................................................................e..............................................................................|................................... + // trn2 v28.4s, v15.4s, v16.4s // ..........................................................................e...........................................................................|................................... + // trn2 v15.2d, v25.2d, v27.2d // ............................................................................e.........................................................................|................................... + // trn2 v16.2d, v26.2d, v28.2d // .............................................................................e........................................................................|................................... + // trn1 v13.2d, v25.2d, v27.2d // ..................................................................................e...................................................................|................................... + // trn1 v14.2d, v26.2d, v28.2d // .................................................................................e....................................................................|................................... + // ldr q0, [x4], #64 // ........................................................................................................................e.............................|................................... + // ldr q1, [x4, #(-64 + 16)] // ..............................................................e.......................................................................................|................................... + // ldr q2, [x4, #(-64 + 32)] // ...............................................................e......................................................................................|................................... 
+ // ldr q3, [x4, #(-64 + 48)] // ................................................................................e.....................................................................|................................... + // sub v24.4s, v9.4s, v10.4s // .........................................................................................e............................................................|................................... + // add v9.4s, v9.4s, v10.4s // ............................................................................................................e.........................................|................................... + // mul v10.4s, v24.4s, v1.s[2] // ............................................................................................e.........................................................|................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .............................................................................................e........................................................|................................... + // mls v10.4s, v24.4s, v8.s[0] // .................................................................................................e....................................................|................................... + // sub v24.4s, v11.4s, v12.4s // ...........................................................................e..........................................................................|................................... + // add v11.4s, v11.4s, v12.4s // .............................................................................................................e........................................|................................... 
+ // mul v12.4s, v24.4s, v2.s[0] // ..............................................................................e.......................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........................................................................................e...........................................................|................................... + // mls v12.4s, v24.4s, v8.s[0] // ..............................................................................................e.......................................................|................................... + // sub v24.4s, v13.4s, v14.4s // ....................................................................................e.................................................................|................................... + // add v13.4s, v13.4s, v14.4s // ..................................................................................................e...................................................|................................... + // mul v14.4s, v24.4s, v2.s[2] // .......................................................................................e..............................................................|................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ........................................................................................e.............................................................|................................... + // mls v14.4s, v24.4s, v8.s[0] // ...............................................................................................e......................................................|................................... 
+ // sub v24.4s, v15.4s, v16.4s // ...............................................................................e......................................................................|................................... + // add v15.4s, v15.4s, v16.4s // ...................................................................................................e..................................................|................................... + // mul v16.4s, v24.4s, v3.s[0] // ...................................................................................e..................................................................|................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...........................................................................................e..........................................................|................................... + // mls v16.4s, v24.4s, v8.s[0] // ................................................................................................e.....................................................|................................... + // sub v24.4s, v9.4s, v11.4s // ................................................................................................................................e.....................|................................... + // add v9.4s, v9.4s, v11.4s // ................................................................................................................e.....................................|................................... + // mul v11.4s, v24.4s, v0.s[2] // ......................................................................................................................................e...............|................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................................e..............|................................... + // mls v11.4s, v24.4s, v8.s[0] // ............................................................................................................................................e.........|................................... + // sub v24.4s, v10.4s, v12.4s // .......................................................................................................................e..............................|................................... + // add v10.4s, v10.4s, v12.4s // .....................................................................................................e................................................|................................... + // mul v12.4s, v24.4s, v0.s[2] // ...........................................................................................................................e..........................|................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................................................................................................e........................|................................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................................................................................e....................|................................... + // sub v24.4s, v13.4s, v15.4s // ......................................................................................................e...............................................|................................... 
+ // add v13.4s, v13.4s, v15.4s // .........................................................................................................e............................................|................................... + // mul v15.4s, v24.4s, v1.s[0] // ..........................................................................................................................................e...........|................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ....................................................................................................................................e.................|................................... + // mls v15.4s, v24.4s, v8.s[0] // ..............................................................................................................................................e.......|................................... + // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................e.......................................|................................... + // add v14.4s, v14.4s, v16.4s // ....................................................................................................e.................................................|................................... + // mul v16.4s, v24.4s, v1.s[0] // ....................................................................................................................e.................................|................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e...........................|................................... 
+ // mls v16.4s, v24.4s, v8.s[0] // ..............................................................................................................................e.......................|................................... + // srshr v24.4S, v9.4S, #23 // ...................................................................................................................e..................................|................................... + // mls v9.4s, v24.4s, v8.4s // ......................................................................................................................e...............................|................................... + // srshr v24.4S, v10.4S, #23 // ........................................................................................................e.............................................|................................... + // mls v10.4s, v24.4s, v8.4s // ...........................................................................................................e..........................................|................................... + // srshr v24.4S, v13.4S, #23 // .................................................................................................................e....................................|................................... + // mls v13.4s, v24.4s, v8.4s // .....................................................................................................................e................................|................................... + // srshr v24.4S, v14.4S, #23 // .......................................................................................................e..............................................|................................... + // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................e...........................................|................................... 
+ // sub v24.4s, v9.4s, v13.4s // ...............................................................................................................................e......................|................................... + // add v9.4s, v9.4s, v13.4s // .........................................................................................................................e............................|................................... + // mul v13.4s, v24.4s, v0.s[0] // ...................................................................................................................................e..................|................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..................................................................................................................................e...................|................................... + // mls v13.4s, v24.4s, v8.s[0] // ...........................................................................................................................................e..........|................................... + // sub v24.4s, v10.4s, v14.4s // ......................................................................................................................................................*................................... + // add v10.4s, v10.4s, v14.4s // ...............................................................................................................e......................................|................................... + // mul v14.4s, v24.4s, v0.s[0] // ....................*.................................................................................................................................|.....................*............. 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................*..................................................................................................................................|....................*.............. + // mls v14.4s, v24.4s, v8.s[0] // ........................*.............................................................................................................................|.........................*......... + // sub v24.4s, v11.4s, v15.4s // ....................................................................................................................................................e.|................................... + // add v11.4s, v11.4s, v15.4s // ..................................................................................................................................................e...|................................... + // mul v15.4s, v24.4s, v0.s[0] // .........................*............................................................................................................................|..........................*........ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................*..............................................................................................................................|........................*.......... + // mls v15.4s, v24.4s, v8.s[0] // .............................*........................................................................................................................|..............................*.... + // sub v24.4s, v12.4s, v16.4s // .....................................................................................................................................e................|................................... 
+ // add v12.4s, v12.4s, v16.4s // ................................................................................................................................................e.....|................................... + // mul v16.4s, v24.4s, v0.s[0] // ........................................................................................................................................e.............|................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........................................................................................................................................e............|................................... + // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................................................................e........|................................... + // str q9, [x1], #(16*4) // ............................................................................................................................e.........................|................................... + // str q10, [x1, #(-16*4 + 1*16)] // ..................................................................................................................e...................................|................................... + // str q11, [x1, #(-16*4 + 2*16)] // .....................................................................................................................................................e|................................... + // str q12, [x1, #(-16*4 + 3*16)] // ...................................................................................................................................................e..|................................... 
+ // str q13, [x2], #(16*4) // ...............................................................................................................................................e......|................................... + // str q14, [x2, #(-16*4 + 1*16)] // ..............................*.......................................................................................................................|...............................*... + // str q15, [x2, #(-16*4 + 2*16)] // ................................*.....................................................................................................................|.................................*. + // str q16, [x2, #(-16*4 + 3*16)] // .................................................................................................................................................e....|................................... + // add x1, x1, #64 // ......................................................................................................................................................|*.................................. + // add x2, x2, #64 // .................................*....................................................................................................................|..................................* + + sub count, count, #1 + cbnz count, layer45678_start + sub v4.4S, v27.4S, v4.4S // *.......... + add x1, x1, #64 // .*......... + sqrdmulh v17.4S, v10.4S, v29.S[1] // ....*...... + // gap // ........... + mul v0.4S, v10.4S, v29.S[0] // ......*.... + // gap // ........... + sqrdmulh v31.4S, v4.4S, v29.S[1] // ..*........ + // gap // ........... + mul v29.4S, v4.4S, v29.S[0] // ...*....... + // gap // ........... + // gap // ........... + // gap // ........... + mls v0.4S, v17.4S, v8.S[0] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + mls v29.4S, v31.4S, v8.S[0] // .....*..... + // gap // ........... + // gap // ........... 
+    // gap // ...........
+    str q0, [x2, #-32] // .........*.
+    // gap // ...........
+    // gap // ...........
+    // gap // ...........
+    str q29, [x2, #-48] // ........*..
+    add x2, x2, #64 // ..........*
+
+    // Reference listing for the eleven instructions that follow the
+    // `cbnz count, layer45678_start` branch, in their order before rescheduling.
+    // The '*' in each trailing column marks the slot that instruction occupies
+    // in the scheduled sequence above; conversely, the '*' on a scheduled
+    // instruction marks its index in this listing. Temporary register names
+    // differ between the two listings.
+    // NOTE(review): the `// gap` lines look like issue slots the scheduler
+    // left empty -- confirm against the tool that generated this file.
+    // original source code
+    // sub v6.4S, v27.4S, v4.4S // *..........
+    // add x1, x1, #64 // .*.........
+    // sqrdmulh v12.4S, v6.4S, v29.S[1] // ....*......
+    // mul v6.4S, v6.4S, v29.S[0] // .....*.....
+    // sqrdmulh v9.4S, v10.4S, v29.S[1] // ..*........
+    // mls v6.4S, v12.4S, v8.S[0] // .......*...
+    // mul v13.4S, v10.4S, v29.S[0] // ...*.......
+    // mls v13.4S, v9.4S, v8.S[0] // ......*....
+    // str q6, [x2, #-48] // .........*.
+    // str q13, [x2, #-32] // ........*..
+    // add x2, x2, #64 // ..........*
+
+
+// -----------------------------------------------------------------------------
+
+    // Symbolic names for four vector registers. The scheduled layer123 loop
+    // further down refers to them by raw register number: v25/v26 appear as
+    // operands of mul/sqrdmulh, v30/v31 as operands of cmge.
+    ninv .req v25
+    ninv_tw .req v26
+    modulus_half .req v30
+    neg_modulus_half .req v31
+
+    // ld1r loads one 32-bit element from [xtmp] and replicates it into all
+    // four .4s lanes of the destination.
+    // NOTE(review): ASM_LOAD is a macro defined outside this excerpt;
+    // presumably it puts the address of the named symbol into its first
+    // argument -- confirm against its definition.
+    ASM_LOAD(xtmp, ninv_addr)
+    ld1r {ninv.4s}, [xtmp]
+    ASM_LOAD(xtmp, ninv_tw_addr)
+    ld1r {ninv_tw.4s}, [xtmp]
+
+    // modulus_half = consts >> 1 (lane-wise, unsigned shift);
+    // neg_modulus_half = -modulus_half (lane-wise).
+    // NOTE(review): `consts` is aliased outside this excerpt; presumably its
+    // lanes hold the modulus -- confirm.
+    ushr modulus_half.4S, consts.4S, #1
+    neg neg_modulus_half.4S, modulus_half.4S
+
+    // `count` is decremented once more right before the layer123_start label.
+    // NOTE(review): presumably the branch closing that loop tests `count`,
+    // as `cbnz count, layer45678_start` does for the previous loop -- confirm.
+    mov count, #8
+    // NOTE(review): load_roots_123 is a macro defined outside this excerpt;
+    // presumably it reads the twiddle factors through r_ptr0 into the vector
+    // registers the loop indexes by lane (v0-v3) -- confirm.
+    ASM_LOAD(r_ptr0, roots_l012)
+    load_roots_123
+
+    // Align the following code to a 2^2 = 4-byte boundary.
+    .p2align 2
+    // Instructions issued ahead of the layer123_start loop. The registers
+    // written here (v11, v20, v4, v28, v9, v29, v15, v22) are read by the
+    // loop body before it overwrites them. The '*' column marks each
+    // instruction's index in the reference listing that follows this sequence.
+    ldr q11, [x0, #256] // ..*.........
+    // gap // ............
+    // gap // ............
+    // gap // ............
+    ldr q20, [x0, #384] // ........*...
+    // gap // ............
+    // gap // ............
+    // gap // ............
+    ldr q4, [x0, #896] // .......*....
+    // gap // ............
+    // gap // ............
+    // gap // ............
+    ldr q28, [x0, #512] // ...*........
+    // gap // ............
+    // gap // ............
+    // gap // ............
+    ldr q9, [x0, #768] // .....*......
+    // gap // ............
+    // gap // ............
+    // gap // ............
+    ldr q29, [x0, #640] // ....*.......
+    // gap // ............
+    // gap // ............
+    // gap // ............
+    add v15.4S, v11.4S, v20.4S // ..........*.
+    // gap // ............
+    add v22.4S, v9.4S, v4.4S // .........*..
+    // gap // ............
+ add v13.4S, v28.4S, v29.4S // ......*..... + // gap // ............ + ldr q18, [x0, #0] // *........... + // gap // ............ + // gap // ............ + // gap // ............ + add v23.4S, v13.4S, v22.4S // ...........* + // gap // ............ + ldr q14, [x0, #128] // .*.......... + // gap // ............ + + // original source code + // ldr q18, [x0, #0] // .........*.. + // ldr q14, [x0, #128] // ...........* + // ldr q11, [x0, #256] // *........... + // ldr q28, [x0, #512] // ...*........ + // ldr q29, [x0, #640] // .....*...... + // ldr q9, [x0, #768] // ....*....... + // add v13.4S, v28.4S, v29.4S // ........*... + // ldr q4, [x0, #896] // ..*......... + // ldr q20, [x0, #384] // .*.......... + // add v22.4S, v9.4S, v4.4S // .......*.... + // add v15.4S, v11.4S, v20.4S // ......*..... + // add v23.4S, v13.4S, v22.4S // ..........*. + + sub count, count, #1 +layer123_start: + sub v17.4S, v18.4S, v14.4S // ........*............................................................................................................... + // gap // ........................................................................................................................ + add v19.4S, v18.4S, v14.4S // .........*.............................................................................................................. + // gap // ........................................................................................................................ + sub v18.4S, v11.4S, v20.4S // .............*.......................................................................................................... + // gap // ........................................................................................................................ + mul v6.4S, v17.4S, v1.S[2] // ..........*............................................................................................................. 
+ // gap // ........................................................................................................................ + sqrdmulh v17.4S, v17.4S, v1.S[3] // ...........*............................................................................................................ + // gap // ........................................................................................................................ + sub v14.4S, v19.4S, v15.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + add v19.4S, v19.4S, v15.4S // .............................*.......................................................................................... + // gap // ........................................................................................................................ + mul v11.4S, v18.4S, v2.S[0] // ...............*........................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v18.4S, v18.4S, v2.S[1] // ................*....................................................................................................... + // gap // ........................................................................................................................ + mls v6.4S, v17.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ 
+ sub v17.4S, v28.4S, v29.4S // ..................*..................................................................................................... + // gap // ........................................................................................................................ + mul v28.4S, v14.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v14.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + sub v29.4S, v19.4S, v23.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + add v19.4S, v19.4S, v23.4S // .................................................*...................................................................... + // gap // ........................................................................................................................ + mls v11.4S, v18.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + mul v18.4S, v17.4S, v2.S[2] // ....................*................................................................................................... 
+ // gap // ........................................................................................................................ + sqrdmulh v17.4S, v17.4S, v2.S[3] // .....................*.................................................................................................. + // gap // ........................................................................................................................ + sub v9.4S, v9.4S, v4.4S // .......................*................................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v6.4S, v11.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + add v6.4S, v6.4S, v11.4S // ..................................*..................................................................................... + // gap // ........................................................................................................................ + mls v18.4S, v17.4S, v8.S[0] // ......................*................................................................................................. + // gap // ........................................................................................................................ + mul v17.4S, v9.4S, v3.S[0] // .........................*.............................................................................................. + // gap // ........................................................................................................................ 
+ mls v28.4S, v14.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v9.4S, v3.S[1] // ..........................*............................................................................................. + // gap // ........................................................................................................................ + mul v11.4S, v4.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v4.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + mul v4.4S, v29.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v29.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + mul v20.4S, v19.4S, v25.4S // ........................................................................................*............................... 
+ // gap // ........................................................................................................................ + sqrdmulh v19.4S, v19.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + mls v17.4S, v14.4S, v8.S[0] // ...........................*............................................................................................ + // gap // ........................................................................................................................ + mls v11.4S, v9.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + sub v14.4S, v13.4S, v22.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + mls v4.4S, v29.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + sub v29.4S, v18.4S, v17.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ 
+ mul v9.4S, v14.4S, v1.S[0] // ........................................*............................................................................... + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v14.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + add v17.4S, v18.4S, v17.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + mul v18.4S, v29.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v29.4S, v1.S[1] // ..............................................*......................................................................... + // gap // ........................................................................................................................ + sub v13.4S, v6.4S, v17.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + add v17.4S, v6.4S, v17.4S // ......................................................*................................................................. 
+ // gap // ........................................................................................................................ + mls v9.4S, v14.4S, v8.S[0] // ..........................................*............................................................................. + // gap // ........................................................................................................................ + mls v18.4S, v29.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + mul v6.4S, v13.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + sqrdmulh v14.4S, v13.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + sub v29.4S, v28.4S, v9.4S // ..........................................................*............................................................. + // gap // ........................................................................................................................ + add v28.4S, v28.4S, v9.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ 
+ sub v9.4S, v11.4S, v18.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + mls v6.4S, v14.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + mul v14.4S, v29.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v29.4S, v0.S[1] // .............................................................*.......................................................... + // gap // ........................................................................................................................ + add v18.4S, v11.4S, v18.4S // ................................................................*....................................................... + // gap // ........................................................................................................................ + mul v11.4S, v9.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v9.4S, v0.S[1] // ..................................................................*..................................................... 
+ // gap // ........................................................................................................................ + mls v14.4S, v29.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v4.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + cmge v13.4S, v4.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + mls v20.4S, v19.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + mls v11.4S, v9.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + sub v19.4S, v29.4S, v13.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ 
+ cmge v29.4S, v31.4S, v6.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + cmge v9.4S, v6.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + mls v4.4S, v19.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sub v19.4S, v29.4S, v9.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v14.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + cmge v9.4S, v14.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + mls v6.4S, v19.4S, v8.4S // ...........................................................................*............................................ + // gap // ........................................................................................................................ 
+ sub v19.4S, v29.4S, v9.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v29.4S, v31.4S, v11.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + cmge v9.4S, v11.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + mls v14.4S, v19.4S, v8.4S // ...............................................................................*........................................ + // gap // ........................................................................................................................ + sub v19.4S, v29.4S, v9.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + str q4, [x0, #512] // ....................................................................................*................................... + // gap // ........................................................................................................................ + mul v29.4S, v17.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ 
+ mls v11.4S, v19.4S, v8.4S // ...................................................................................*.................................... + // gap // ........................................................................................................................ + str q6, [x0, #640] // .....................................................................................*.................................. + // gap // ........................................................................................................................ + sqrdmulh v17.4S, v17.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + str q14, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + mul v19.4S, v28.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + str q11, [x0, #896] // .......................................................................................*................................ + // gap // ........................................................................................................................ + mls v29.4S, v17.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ 
+ sqrdmulh v17.4S, v28.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ + mul v6.4S, v18.4S, v25.4S // .................................................................................................*...................... + // gap // ........................................................................................................................ + sqrdmulh v18.4S, v18.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + cmge v14.4S, v31.4S, v20.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + mls v19.4S, v17.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ + cmge v17.4S, v20.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + mls v6.4S, v18.4S, v8.S[0] // ...................................................................................................*.................... 
+ // gap // ........................................................................................................................ + sub v17.4S, v14.4S, v17.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + cmge v18.4S, v31.4S, v29.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + cmge v14.4S, v29.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + mls v20.4S, v17.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + sub v17.4S, v18.4S, v14.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + cmge v18.4S, v31.4S, v19.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ 
+ cmge v14.4S, v19.4S, v30.4S // .............................................................................................................*.......... + // gap // ........................................................................................................................ + mls v29.4S, v17.4S, v8.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + sub v17.4S, v18.4S, v14.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + cmge v18.4S, v31.4S, v6.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + cmge v14.4S, v6.4S, v30.4S // .................................................................................................................*...... + // gap // ........................................................................................................................ + mls v19.4S, v17.4S, v8.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + sub v17.4S, v18.4S, v14.4S // ..................................................................................................................*..... 
+ // gap // ........................................................................................................................ + str q20, [x0], #(16) // ....................................................................................................................*... + // gap // ........................................................................................................................ + ldr q18, [x0, #0] // e....................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v6.4S, v17.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + str q29, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + ldr q14, [x0, #128] // .e...................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + str q19, [x0, #240] // ......................................................................................................................*. + // gap // ........................................................................................................................ + ldr q11, [x0, #256] // ..e..................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q28, [x0, #512] // ....e................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q29, [x0, #640] // .....e.................................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + ldr q9, [x0, #768] // ......e................................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v13.4S, v28.4S, v29.4S // ...................e.................................................................................................... + // gap // ........................................................................................................................ + ldr q4, [x0, #896] // .......e................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q20, [x0, #384] // ...e.................................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + add v22.4S, v9.4S, v4.4S // ........................e............................................................................................... + // gap // ........................................................................................................................ + str q6, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + add v15.4S, v11.4S, v20.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + add v23.4S, v13.4S, v22.4S // .......................................e................................................................................ + // gap // ........................................................................................................................ + + // original source code + // ldr q9, [x0, #0] // e...............|.......................................................................................................e............. + // ldr q10, [x0, #(1*(1024/8))] // ...e............|..........................................................................................................e.......... + // ldr q11, [x0, #(2*(1024/8))] // .....e..........|............................................................................................................e........ + // ldr q12, [x0, #(3*(1024/8))] // ...........e....|..................................................................................................................e.. 
+ // ldr q13, [x0, #(4*(1024/8))] // ......e.........|.............................................................................................................e....... + // ldr q14, [x0, #(5*(1024/8))] // .......e........|..............................................................................................................e...... + // ldr q15, [x0, #(6*(1024/8))] // ........e.......|...............................................................................................................e..... + // ldr q16, [x0, #(7*(1024/8))] // ..........e.....|.................................................................................................................e... + // sub v24.4s, v9.4s, v10.4s // ................*..................................................................................................................... + // add v9.4s, v9.4s, v10.4s // ................|*.................................................................................................................... + // mul v10.4s, v24.4s, v1.s[2] // ................|..*.................................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ................|...*................................................................................................................. + // mls v10.4s, v24.4s, v8.s[0] // ................|........*............................................................................................................ + // sub v24.4s, v11.4s, v12.4s // ................|.*................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ..............e.|..................................................................................................................... 
+ // mul v12.4s, v24.4s, v2.s[0] // ................|......*.............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................|.......*............................................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ................|..............*...................................................................................................... + // sub v24.4s, v13.4s, v14.4s // ................|.........*........................................................................................................... + // add v13.4s, v13.4s, v14.4s // .........e......|................................................................................................................e.... + // mul v14.4s, v24.4s, v2.s[2] // ................|...............*..................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................|................*.................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|....................*................................................................................................ + // sub v24.4s, v15.4s, v16.4s // ................|.................*................................................................................................... + // add v15.4s, v15.4s, v16.4s // ............e...|...................................................................................................................e. + // mul v16.4s, v24.4s, v3.s[0] // ................|.....................*............................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ................|.......................*............................................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|..............................*...................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................|....*................................................................................................................ + // add v9.4s, v9.4s, v11.4s // ................|.....*............................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ................|..........*.......................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|...........*......................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ................|......................*.............................................................................................. + // sub v24.4s, v10.4s, v12.4s // ................|..................*.................................................................................................. + // add v10.4s, v10.4s, v12.4s // ................|...................*................................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ................|........................*............................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................|.........................*........................................................................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ................|...............................*..................................................................................... + // sub v24.4s, v13.4s, v15.4s // ................|................................*.................................................................................... + // add v13.4s, v13.4s, v15.4s // ...............e|..................................................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ................|...................................*................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|....................................*................................................................................ + // mls v15.4s, v24.4s, v8.s[0] // ................|..........................................*.......................................................................... + // sub v24.4s, v14.4s, v16.4s // ................|..................................*.................................................................................. + // add v14.4s, v14.4s, v16.4s // ................|.....................................*............................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ................|......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ................|.......................................*............................................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................*......................................................................... 
+ // sub v24.4s, v9.4s, v13.4s // ................|............*........................................................................................................ + // add v9.4s, v9.4s, v13.4s // ................|.............*....................................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ................|..........................*.......................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...........................*......................................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ................|.................................*................................................................................... + // sub v24.4s, v10.4s, v14.4s // ................|........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ................|.........................................*........................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ................|............................................*........................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|.............................................*....................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ................|.................................................*................................................................... + // sub v24.4s, v11.4s, v15.4s // ................|..............................................*...................................................................... 
+ // add v11.4s, v11.4s, v15.4s // ................|...............................................*..................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ................|..................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|...................................................*................................................................. + // mls v15.4s, v24.4s, v8.s[0] // ................|.......................................................*............................................................. + // sub v24.4s, v12.4s, v16.4s // ................|................................................*.................................................................... + // add v12.4s, v12.4s, v16.4s // ................|....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ................|.....................................................*............................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................|......................................................*.............................................................. + // mls v16.4s, v24.4s, v8.s[0] // ................|...........................................................*......................................................... + // cmge v27.4s, v31.4s, v13.4s // ................|........................................................*............................................................ + // cmge v28.4s, v13.4s, v30.4s // ................|.........................................................*........................................................... 
+ // sub v28.4s, v27.4s, v28.4s // ................|............................................................*........................................................ + // mls v13.4s, v28.4s, v8.4s // ................|...............................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // ................|.............................................................*....................................................... + // cmge v28.4s, v14.4s, v30.4s // ................|..............................................................*...................................................... + // sub v28.4s, v27.4s, v28.4s // ................|................................................................*.................................................... + // mls v14.4s, v28.4s, v8.4s // ................|...................................................................*................................................. + // cmge v27.4s, v31.4s, v15.4s // ................|.................................................................*................................................... + // cmge v28.4s, v15.4s, v30.4s // ................|..................................................................*.................................................. + // sub v28.4s, v27.4s, v28.4s // ................|....................................................................*................................................ + // mls v15.4s, v28.4s, v8.4s // ................|.......................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................|.....................................................................*............................................... 
+ // cmge v28.4s, v16.4s, v30.4s // ................|......................................................................*.............................................. + // sub v28.4s, v27.4s, v28.4s // ................|........................................................................*............................................ + // mls v16.4s, v28.4s, v8.4s // ................|...........................................................................*......................................... + // str q13, [x0, #(4*(1024/8))] // ................|.........................................................................*........................................... + // str q14, [x0, #(5*(1024/8))] // ................|............................................................................*........................................ + // str q15, [x0, #(6*(1024/8))] // ................|..............................................................................*...................................... + // str q16, [x0, #(7*(1024/8))] // ................|................................................................................*.................................... + // mul v13.4s, v9.4s, v25.4s // ................|............................*........................................................................................ + // sqrdmulh v9.4s, v9.4s, v26.4s // ................|.............................*....................................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ................|..........................................................*.......................................................... + // mul v14.4s, v10.4s, v25.4s // ................|..........................................................................*.......................................... 
+ // sqrdmulh v10.4s, v10.4s, v26.4s // ................|.............................................................................*....................................... + // mls v14.4s, v10.4s, v8.s[0] // ................|.................................................................................*................................... + // mul v15.4s, v11.4s, v25.4s // ................|...............................................................................*..................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ................|..................................................................................*.................................. + // mls v15.4s, v11.4s, v8.s[0] // ................|......................................................................................*.............................. + // mul v16.4s, v12.4s, v25.4s // ................|...................................................................................*................................. + // sqrdmulh v12.4s, v12.4s, v26.4s // ................|....................................................................................*................................ + // mls v16.4s, v12.4s, v8.s[0] // ................|........................................................................................*............................ + // cmge v27.4s, v31.4s, v13.4s // ................|.....................................................................................*............................... + // cmge v28.4s, v13.4s, v30.4s // ................|.......................................................................................*............................. + // sub v28.4s, v27.4s, v28.4s // ................|.........................................................................................*........................... 
+ // mls v13.4s, v28.4s, v8.4s // ................|............................................................................................*........................ + // cmge v27.4s, v31.4s, v14.4s // ................|..........................................................................................*.......................... + // cmge v28.4s, v14.4s, v30.4s // ................|...........................................................................................*......................... + // sub v28.4s, v27.4s, v28.4s // ................|.............................................................................................*....................... + // mls v14.4s, v28.4s, v8.4s // ................|................................................................................................*.................... + // cmge v27.4s, v31.4s, v15.4s // ................|..............................................................................................*...................... + // cmge v28.4s, v15.4s, v30.4s // ................|...............................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ................|.................................................................................................*................... + // mls v15.4s, v28.4s, v8.4s // ................|....................................................................................................*................ + // cmge v27.4s, v31.4s, v16.4s // ................|..................................................................................................*.................. + // cmge v28.4s, v16.4s, v30.4s // ................|...................................................................................................*................. 
+ // sub v28.4s, v27.4s, v28.4s // ................|.....................................................................................................*............... + // mls v16.4s, v28.4s, v8.4s // .*..............|........................................................................................................*............ + // str q13, [x0], #(16) // ................|......................................................................................................*.............. + // str q14, [x0, #(-16 + 1*(1024/8))] // ..*.............|.........................................................................................................*........... + // str q15, [x0, #(-16 + 2*(1024/8))] // ....*...........|...........................................................................................................*......... + // str q16, [x0, #(-16 + 3*(1024/8))] // .............*..|....................................................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + sub v20.4S, v11.4S, v20.4S // ..*......................................................................................................... + // gap // ............................................................................................................ + sub v5.4S, v18.4S, v14.4S // *........................................................................................................... + // gap // ............................................................................................................ + sub v11.4S, v28.4S, v29.4S // ..........*................................................................................................. + // gap // ............................................................................................................ 
+ sqrdmulh v29.4S, v20.4S, v2.S[1] // ........*................................................................................................... + // gap // ............................................................................................................ + mul v17.4S, v20.4S, v2.S[0] // .......*.................................................................................................... + // gap // ............................................................................................................ + sub v9.4S, v9.4S, v4.4S // ..................*......................................................................................... + // gap // ............................................................................................................ + sqrdmulh v24.4S, v5.4S, v1.S[3] // ....*....................................................................................................... + // gap // ............................................................................................................ + sqrdmulh v21.4S, v11.4S, v2.S[3] // .................*.......................................................................................... + // gap // ............................................................................................................ + mul v19.4S, v9.4S, v3.S[0] // ......................*..................................................................................... + // gap // ............................................................................................................ + mul v27.4S, v5.4S, v1.S[2] // ...*........................................................................................................ + // gap // ............................................................................................................ + mul v28.4S, v11.4S, v2.S[2] // ................*........................................................................................... 
+ // gap // ............................................................................................................ + sqrdmulh v6.4S, v9.4S, v3.S[1] // ........................*................................................................................... + // gap // ............................................................................................................ + mls v17.4S, v29.4S, v8.S[0] // ...............*............................................................................................ + // gap // ............................................................................................................ + mls v27.4S, v24.4S, v8.S[0] // .........*.................................................................................................. + // gap // ............................................................................................................ + mls v28.4S, v21.4S, v8.S[0] // .....................*...................................................................................... + // gap // ............................................................................................................ + mls v19.4S, v6.4S, v8.S[0] // ...............................*............................................................................ + // gap // ............................................................................................................ + add v16.4S, v18.4S, v14.4S // .*.......................................................................................................... + // gap // ............................................................................................................ + sub v18.4S, v27.4S, v17.4S // ...................*........................................................................................ + // gap // ............................................................................................................ 
+ add v7.4S, v27.4S, v17.4S // ....................*....................................................................................... + // gap // ............................................................................................................ + sub v17.4S, v28.4S, v19.4S // ...................................*........................................................................ + // gap // ............................................................................................................ + sqrdmulh v14.4S, v18.4S, v0.S[3] // ..........................*................................................................................. + // gap // ............................................................................................................ + mul v4.4S, v18.4S, v0.S[2] // .........................*.................................................................................. + // gap // ............................................................................................................ + sqrdmulh v18.4S, v17.4S, v1.S[1] // ........................................*................................................................... + // gap // ............................................................................................................ + mul v11.4S, v17.4S, v1.S[0] // .......................................*.................................................................... + // gap // ............................................................................................................ + add v27.4S, v28.4S, v19.4S // ......................................*..................................................................... + // gap // ............................................................................................................ + sub v28.4S, v16.4S, v15.4S // .....*...................................................................................................... 
+ // gap // ............................................................................................................ + mls v4.4S, v14.4S, v8.S[0] // ................................*........................................................................... + // gap // ............................................................................................................ + mls v11.4S, v18.4S, v8.S[0] // ............................................*............................................................... + // gap // ............................................................................................................ + mul v20.4S, v28.4S, v0.S[2] // ...........*................................................................................................ + // gap // ............................................................................................................ + add v19.4S, v7.4S, v27.4S // ..........................................*................................................................. + // gap // ............................................................................................................ + sqrdmulh v21.4S, v28.4S, v0.S[3] // ............*............................................................................................... + // gap // ............................................................................................................ + add v17.4S, v4.4S, v11.4S // .....................................................*...................................................... + // gap // ............................................................................................................ + mul v28.4S, v19.4S, v25.4S // ...........................................................................*................................ + // gap // ............................................................................................................ 
+ sub v14.4S, v13.4S, v22.4S // .................................*.......................................................................... + // gap // ............................................................................................................ + mul v9.4S, v17.4S, v25.4S // ....................................................................................*....................... + // gap // ............................................................................................................ + sqrdmulh v17.4S, v17.4S, v26.4S // .....................................................................................*...................... + // gap // ............................................................................................................ + mls v20.4S, v21.4S, v8.S[0] // .......................*.................................................................................... + // gap // ............................................................................................................ + sqrdmulh v18.4S, v14.4S, v1.S[1] // .....................................*...................................................................... + // gap // ............................................................................................................ + mul v13.4S, v14.4S, v1.S[0] // ....................................*....................................................................... + // gap // ............................................................................................................ + mls v9.4S, v17.4S, v8.S[0] // .........................................................................................*.................. + // gap // ............................................................................................................ + add v21.4S, v16.4S, v15.4S // ......*..................................................................................................... 
+ // gap // ............................................................................................................ + sqrdmulh v10.4S, v19.4S, v26.4S // ..............................................................................*............................. + // gap // ............................................................................................................ + mls v13.4S, v18.4S, v8.S[0] // ...........................................*................................................................ + // gap // ............................................................................................................ + cmge v17.4S, v9.4S, v30.4S // ....................................................................................................*....... + // gap // ............................................................................................................ + cmge v19.4S, v31.4S, v9.4S // ...................................................................................................*........ + // gap // ............................................................................................................ + add v18.4S, v21.4S, v23.4S // ..............*............................................................................................. + // gap // ............................................................................................................ + sub v12.4S, v19.4S, v17.4S // ......................................................................................................*..... + // gap // ............................................................................................................ + add v17.4S, v20.4S, v13.4S // ................................................*........................................................... + // gap // ............................................................................................................ 
+ sqrdmulh v16.4S, v18.4S, v26.4S // ..............................*............................................................................. + // gap // ............................................................................................................ + mls v9.4S, v12.4S, v8.4S // ........................................................................................................*... + // gap // ............................................................................................................ + mul v6.4S, v17.4S, v25.4S // ................................................................................*........................... + // gap // ............................................................................................................ + sqrdmulh v14.4S, v17.4S, v26.4S // ...................................................................................*........................ + // gap // ............................................................................................................ + mls v28.4S, v10.4S, v8.S[0] // ..................................................................................*......................... + // gap // ............................................................................................................ + mul v15.4S, v18.4S, v25.4S // .............................*.............................................................................. + // gap // ............................................................................................................ + sub v4.4S, v4.4S, v11.4S // .................................................*.......................................................... + // gap // ............................................................................................................ + mls v6.4S, v14.4S, v8.S[0] // .......................................................................................*.................... 
+ // gap // ............................................................................................................ + cmge v11.4S, v31.4S, v28.4S // ...........................................................................................*................ + // gap // ............................................................................................................ + cmge v17.4S, v28.4S, v30.4S // ............................................................................................*............... + // gap // ............................................................................................................ + mls v15.4S, v16.4S, v8.S[0] // ...........................................................*................................................ + // gap // ............................................................................................................ + sub v18.4S, v11.4S, v17.4S // ..............................................................................................*............. + // gap // ............................................................................................................ + cmge v17.4S, v6.4S, v30.4S // ................................................................................................*........... + // gap // ............................................................................................................ + cmge v19.4S, v31.4S, v6.4S // ...............................................................................................*............ + // gap // ............................................................................................................ + mls v28.4S, v18.4S, v8.4S // .................................................................................................*.......... + // gap // ............................................................................................................ 
+ cmge v18.4S, v31.4S, v15.4S // ......................................................................................*..................... + // gap // ............................................................................................................ + cmge v29.4S, v15.4S, v30.4S // ........................................................................................*................... + // gap // ............................................................................................................ + sub v14.4S, v20.4S, v13.4S // ...............................................*............................................................ + // gap // ............................................................................................................ + sub v29.4S, v18.4S, v29.4S // ..........................................................................................*................. + // gap // ............................................................................................................ + sqrdmulh v18.4S, v4.4S, v0.S[1] // .......................................................*.................................................... + // gap // ............................................................................................................ + mul v11.4S, v4.4S, v0.S[0] // ......................................................*..................................................... + // gap // ............................................................................................................ + mls v15.4S, v29.4S, v8.4S // .............................................................................................*.............. + // gap // ............................................................................................................ + sub v4.4S, v7.4S, v27.4S // .........................................*.................................................................. 
+ // gap // ............................................................................................................ + sqrdmulh v29.4S, v14.4S, v0.S[1] // ....................................................*....................................................... + // gap // ............................................................................................................ + mls v11.4S, v18.4S, v8.S[0] // ............................................................*............................................... + // gap // ............................................................................................................ + sqrdmulh v22.4S, v4.4S, v0.S[1] // ..............................................*............................................................. + // gap // ............................................................................................................ + mul v20.4S, v4.4S, v0.S[0] // .............................................*.............................................................. + // gap // ............................................................................................................ + mul v4.4S, v14.4S, v0.S[0] // ...................................................*........................................................ + // gap // ............................................................................................................ + cmge v18.4S, v31.4S, v11.4S // ......................................................................*..................................... + // gap // ............................................................................................................ + cmge v14.4S, v11.4S, v30.4S // .......................................................................*.................................... + // gap // ............................................................................................................ 
+ mls v20.4S, v22.4S, v8.S[0] // ..................................................*......................................................... + // gap // ............................................................................................................ + sub v24.4S, v18.4S, v14.4S // .........................................................................*.................................. + // gap // ............................................................................................................ + mls v4.4S, v29.4S, v8.S[0] // ........................................................*................................................... + // gap // ............................................................................................................ + sub v13.4S, v21.4S, v23.4S // .............*.............................................................................................. + // gap // ............................................................................................................ + mls v11.4S, v24.4S, v8.4S // ............................................................................*............................... + // gap // ............................................................................................................ + cmge v14.4S, v31.4S, v20.4S // ..............................................................*............................................. + // gap // ............................................................................................................ + cmge v22.4S, v31.4S, v4.4S // ..................................................................*......................................... + // gap // ............................................................................................................ + cmge v18.4S, v20.4S, v30.4S // ...............................................................*............................................ 
+ // gap // ............................................................................................................ + str q11, [x0, #896] // .................................................................................*.......................... + // gap // ............................................................................................................ + sub v11.4S, v14.4S, v18.4S // .................................................................*.......................................... + // gap // ............................................................................................................ + cmge v18.4S, v4.4S, v30.4S // ...................................................................*........................................ + // gap // ............................................................................................................ + str q15, [x0], #(16) // .......................................................................................................*.... + // gap // ............................................................................................................ + mul v14.4S, v13.4S, v0.S[0] // ...........................*................................................................................ + // gap // ............................................................................................................ + sqrdmulh v27.4S, v13.4S, v0.S[1] // ............................*............................................................................... + // gap // ............................................................................................................ + sub v18.4S, v22.4S, v18.4S // .....................................................................*...................................... + // gap // ............................................................................................................ 
+ mls v20.4S, v11.4S, v8.4S // ....................................................................*....................................... + // gap // ............................................................................................................ + str q28, [x0, #112] // .........................................................................................................*.. + // gap // ............................................................................................................ + mls v14.4S, v27.4S, v8.S[0] // ..................................*......................................................................... + // gap // ............................................................................................................ + mls v4.4S, v18.4S, v8.4S // ........................................................................*................................... + // gap // ............................................................................................................ + str q20, [x0, #624] // .............................................................................*.............................. + // gap // ............................................................................................................ + sub v17.4S, v19.4S, v17.4S // ..................................................................................................*......... + // gap // ............................................................................................................ + cmge v19.4S, v14.4S, v30.4S // ..........................................................*................................................. + // gap // ............................................................................................................ + cmge v18.4S, v31.4S, v14.4S // .........................................................*.................................................. 
+ // gap // ............................................................................................................ + str q9, [x0, #368] // ...........................................................................................................* + // gap // ............................................................................................................ + sub v19.4S, v18.4S, v19.4S // .............................................................*.............................................. + // gap // ............................................................................................................ + mls v6.4S, v17.4S, v8.4S // .....................................................................................................*...... + // gap // ............................................................................................................ + str q4, [x0, #752] // ...............................................................................*............................ + // gap // ............................................................................................................ + mls v14.4S, v19.4S, v8.4S // ................................................................*........................................... + // gap // ............................................................................................................ + // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q6, [x0, #240] // ..........................................................................................................*. + // gap // ............................................................................................................ 
+ // gap // ............................................................................................................ + // gap // ............................................................................................................ + str q14, [x0, #496] // ..........................................................................*................................. + // gap // ............................................................................................................ + + // original source code + // sub v17.4S, v18.4S, v14.4S // .*.......................................................................................................... + // add v19.4S, v18.4S, v14.4S // ................*........................................................................................... + // sub v18.4S, v11.4S, v20.4S // *........................................................................................................... + // mul v6.4S, v17.4S, v1.S[2] // .........*.................................................................................................. + // sqrdmulh v17.4S, v17.4S, v1.S[3] // ......*..................................................................................................... + // sub v14.4S, v19.4S, v15.4S // .........................*.................................................................................. + // add v19.4S, v19.4S, v15.4S // ........................................*................................................................... + // mul v11.4S, v18.4S, v2.S[0] // ....*....................................................................................................... + // sqrdmulh v18.4S, v18.4S, v2.S[1] // ...*........................................................................................................ + // mls v6.4S, v17.4S, v8.S[0] // .............*.............................................................................................. 
+ // sub v17.4S, v28.4S, v29.4S // ..*......................................................................................................... + // mul v28.4S, v14.4S, v0.S[2] // ............................*............................................................................... + // sqrdmulh v14.4S, v14.4S, v0.S[3] // ..............................*............................................................................. + // sub v29.4S, v19.4S, v23.4S // .................................................................................*.......................... + // add v19.4S, v19.4S, v23.4S // .............................................*.............................................................. + // mls v11.4S, v18.4S, v8.S[0] // ............*............................................................................................... + // mul v18.4S, v17.4S, v2.S[2] // ..........*................................................................................................. + // sqrdmulh v17.4S, v17.4S, v2.S[3] // .......*.................................................................................................... + // sub v9.4S, v9.4S, v4.4S // .....*...................................................................................................... + // sub v4.4S, v6.4S, v11.4S // .................*.......................................................................................... + // add v6.4S, v6.4S, v11.4S // ..................*......................................................................................... + // mls v18.4S, v17.4S, v8.S[0] // ..............*............................................................................................. + // mul v17.4S, v9.4S, v3.S[0] // ........*................................................................................................... 
+ // mls v28.4S, v14.4S, v8.S[0] // ....................................*....................................................................... + // sqrdmulh v14.4S, v9.4S, v3.S[1] // ...........*................................................................................................ + // mul v11.4S, v4.4S, v0.S[2] // .....................*...................................................................................... + // sqrdmulh v9.4S, v4.4S, v0.S[3] // ....................*....................................................................................... + // mul v4.4S, v29.4S, v0.S[0] // ..........................................................................................*................. + // sqrdmulh v29.4S, v29.4S, v0.S[1] // ...........................................................................................*................ + // mul v20.4S, v19.4S, v25.4S // .....................................................*...................................................... + // sqrdmulh v19.4S, v19.4S, v26.4S // ................................................*........................................................... + // mls v17.4S, v14.4S, v8.S[0] // ...............*............................................................................................ + // mls v11.4S, v9.4S, v8.S[0] // ..........................*................................................................................. + // sub v14.4S, v13.4S, v22.4S // .................................*.......................................................................... + // mls v4.4S, v29.4S, v8.S[0] // ...............................................................................................*............ + // sub v29.4S, v18.4S, v17.4S // ...................*........................................................................................ 
+ // mul v9.4S, v14.4S, v1.S[0] // ......................................*..................................................................... + // sqrdmulh v14.4S, v14.4S, v1.S[1] // .....................................*...................................................................... + // add v17.4S, v18.4S, v17.4S // ........................*................................................................................... + // mul v18.4S, v29.4S, v1.S[0] // .......................*.................................................................................... + // sqrdmulh v29.4S, v29.4S, v1.S[1] // ......................*..................................................................................... + // sub v13.4S, v6.4S, v17.4S // ......................................................................*..................................... + // add v17.4S, v6.4S, v17.4S // .............................*.............................................................................. + // mls v9.4S, v14.4S, v8.S[0] // ..........................................*................................................................. + // mls v18.4S, v29.4S, v8.S[0] // ...........................*................................................................................ + // mul v6.4S, v13.4S, v0.S[0] // ..........................................................................*................................. + // sqrdmulh v14.4S, v13.4S, v0.S[1] // .........................................................................*.................................. + // sub v29.4S, v28.4S, v9.4S // .................................................................*.......................................... + // add v28.4S, v28.4S, v9.4S // ...............................................*............................................................ 
+ // sub v9.4S, v11.4S, v18.4S // ......................................................*..................................................... + // mls v6.4S, v14.4S, v8.S[0] // ..............................................................................*............................. + // mul v14.4S, v29.4S, v0.S[0] // ...........................................................................*................................ + // sqrdmulh v29.4S, v29.4S, v0.S[1] // .......................................................................*.................................... + // add v18.4S, v11.4S, v18.4S // ...............................*............................................................................ + // mul v11.4S, v9.4S, v0.S[0] // ....................................................................*....................................... + // sqrdmulh v9.4S, v9.4S, v0.S[1] // ...................................................................*........................................ + // mls v14.4S, v29.4S, v8.S[0] // ................................................................................*........................... + // cmge v29.4S, v31.4S, v4.4S // ....................................................................................................*....... + // cmge v13.4S, v4.4S, v30.4S // ...................................................................................................*........ + // mls v20.4S, v19.4S, v8.S[0] // ..........................................................*................................................. + // mls v11.4S, v9.4S, v8.S[0] // ........................................................................*................................... + // sub v19.4S, v29.4S, v13.4S // ......................................................................................................*..... 
+ // cmge v29.4S, v31.4S, v6.4S // ...................................................................................*........................ + // cmge v9.4S, v6.4S, v30.4S // .....................................................................................*...................... + // mls v4.4S, v19.4S, v8.4S // .........................................................................................................*.. + // sub v19.4S, v29.4S, v9.4S // .......................................................................................*.................... + // cmge v29.4S, v31.4S, v14.4S // ....................................................................................*....................... + // cmge v9.4S, v14.4S, v30.4S // ........................................................................................*................... + // mls v6.4S, v19.4S, v8.4S // .............................................................................................*.............. + // sub v19.4S, v29.4S, v9.4S // ............................................................................................*............... + // cmge v29.4S, v31.4S, v11.4S // ............................................................................*............................... + // cmge v9.4S, v11.4S, v30.4S // .............................................................................*.............................. + // mls v14.4S, v19.4S, v8.4S // ................................................................................................*........... + // sub v19.4S, v29.4S, v9.4S // ...............................................................................*............................ 
+ // str q4, [x0, #512] // ...........................................................................................................* + // mul v29.4S, v17.4S, v25.4S // ................................*........................................................................... + // mls v11.4S, v19.4S, v8.4S // ..................................................................................*......................... + // str q6, [x0, #640] // .................................................................................................*.......... + // sqrdmulh v17.4S, v17.4S, v26.4S // .........................................*.................................................................. + // str q14, [x0, #768] // ........................................................................................................*... + // mul v19.4S, v28.4S, v25.4S // ..................................................*......................................................... + // str q11, [x0, #896] // ......................................................................................*..................... + // mls v29.4S, v17.4S, v8.S[0] // ....................................................*....................................................... + // sqrdmulh v17.4S, v28.4S, v26.4S // ...................................................*........................................................ + // mul v6.4S, v18.4S, v25.4S // ..................................*......................................................................... + // sqrdmulh v18.4S, v18.4S, v26.4S // ...................................*........................................................................ + // cmge v14.4S, v31.4S, v20.4S // ...............................................................*............................................ 
+ // mls v19.4S, v17.4S, v8.S[0] // .......................................................*.................................................... + // cmge v17.4S, v20.4S, v30.4S // ................................................................*........................................... + // mls v6.4S, v18.4S, v8.S[0] // .......................................*.................................................................... + // sub v17.4S, v14.4S, v17.4S // ..................................................................*......................................... + // cmge v18.4S, v31.4S, v29.4S // ........................................................*................................................... + // cmge v14.4S, v29.4S, v30.4S // .........................................................*.................................................. + // mls v20.4S, v17.4S, v8.4S // .....................................................................*...................................... + // sub v17.4S, v18.4S, v14.4S // ...........................................................*................................................ + // cmge v18.4S, v31.4S, v19.4S // .............................................................*.............................................. + // cmge v14.4S, v19.4S, v30.4S // ............................................................*............................................... + // mls v29.4S, v17.4S, v8.4S // ..............................................................*............................................. + // sub v17.4S, v18.4S, v14.4S // ..................................................................................................*......... + // cmge v18.4S, v31.4S, v6.4S // ............................................*............................................................... 
+ // cmge v14.4S, v6.4S, v30.4S // ...........................................*................................................................ + // mls v19.4S, v17.4S, v8.4S // .......................................................................................................*.... + // sub v17.4S, v18.4S, v14.4S // ..............................................*............................................................. + // str q20, [x0], #(16) // .........................................................................................*.................. + // mls v6.4S, v17.4S, v8.4S // .................................................*.......................................................... + // str q29, [x0, #112] // ..............................................................................................*............. + // str q19, [x0, #240] // ..........................................................................................................*. + // str q6, [x0, #368] // .....................................................................................................*...... + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s new file mode 100644 index 00000000..d3b6904f --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_a72.s @@ -0,0 +1,2483 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] // load the q-form alias of vec from base at the given byte offset +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc // load from base, then post-increment base by inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] // store the q-form alias of vec to base at the given byte offset +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc // store to base, then post-increment base by inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().s[\i] // by-element operand is written .s[i], same as vqrdmulhq, vmulq and vmlsq +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 // dst = src times lane idx0 of const + vqrdmulhq \src, \src, \const, \idx1 // src is overwritten with the rounding doubling high product by lane idx1 + vmlsq \dst, \src, consts, 0 // dst minus-equals src times lane 0 of consts +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s // dst = src times const, lane-wise + vqrdmulh \src, \src, \const_twisted // src is overwritten with the rounding doubling high product + vmlsq \dst, \src, consts, 0 // dst minus-equals src times lane 0 of consts +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 // tmp = a shifted right by 23 with rounding + vmls \a, tmp, consts // a minus-equals tmp times consts, lane-wise +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 lane = -1 where neg_modulus_half >= a, else 0 + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 lane = -1 where a >= modulus_half, else 0 + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 lane = -1 (a too small), 1 (a too large) or 0 + vmls \a, \tmp2, consts // a minus-equals tmp2 times consts, lane-wise +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s // tmp = a minus b + add \a\().4s, \a\().4s, \b\().4s // a = a plus b (written after tmp so the difference uses the old a) + mulmodq \b, tmp, \root, \idx0, \idx1 // b = mulmodq of the difference by root lanes idx0 and idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s // tmp = a minus b + add \a\().4s, \a\().4s, \b\().4s // a = a plus b + mulmod \b, tmp, \root, \root_twisted // b = mulmod of the difference by the full-vector root pair +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, 
addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc // post-increments addr by inc; the stores below undo that with -(inc) + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) // post-increments addr by inc; the stores below undo that with -(inc) + str \x\()t_01, [\addr, #(-(\inc) + 8*1)] // inc is parenthesized so an expression argument negates as a whole + str \x\()t_10, [\addr, #(-(\inc) + 8*2)] + str \x\()t_11, [\addr, #(-(\inc) + 8*3)] + str \x\()t_20, [\addr, #(-(\inc) + 8*4)] + str \x\()t_21, [\addr, #(-(\inc) + 8*5)] + str \x\()t_30, [\addr, #(-(\inc) + 8*6)] + str \x\()t_31, [\addr, #(-(\inc) + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] // copy 64-bit lane of vec_in into gpr_out +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 // advances r_ptr0 by 64 bytes; the next three loads compensate with -64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 // NOTE(review): body is identical to load_roots_123 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) // advances r_ptr1 by 12 vectors (192 bytes); later offsets are relative to the new pointer + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo 
root0_tw, r_ptr1, (-12*16 + 7*16) + ldr_vo root1, r_ptr1, (-12*16 + 8*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16) + ldr_vo root2, r_ptr1, (-12*16 + 10*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16) +.endm + +.macro transpose4 data0, data1, data2, data3 + trn1 t0.4s, \data0\().4s, \data1\().4s // step 1: interleave 32-bit lanes of the pairs (data0, data1) and (data2, data3) into t0..t3 + trn2 t1.4s, \data0\().4s, \data1\().4s + trn1 t2.4s, \data2\().4s, \data3\().4s + trn2 t3.4s, \data2\().4s, \data3\().4s + + trn2 \data2\().2d, t0.2d, t2.2d // step 2: interleave 64-bit halves; together this is an in-place 4x4 transpose of 32-bit lanes + trn2 \data3\().2d, t1.2d, t3.2d + trn1 \data0\().2d, t0.2d, t2.2d + trn1 \data1\().2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 + trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s // only the 32-bit interleave step, written to separate output registers + trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s + trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s + trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) // reserve 96 bytes: six 16-byte slots for the pairs x19..x30 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): exact duplicate of the previous store; redundant but harmless + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*6) // release the 96 bytes reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // reserve 64 bytes: four 16-byte slots for the pairs d8..d15 + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] // reload register a from stack offset loc +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] // spill register a to stack offset loc; note the argument order is (loc, a) +.endm +.macro push_stack // 
slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_dilithium_123_456_78_twiddles.s" +.text + + .global intt_dilithium_123_45678_opt_a72 + .global _intt_dilithium_123_45678_opt_a72 + +.p2align 4 +const_addr: .word 8380417 + .word 0 + .word 0 + .word 0 +ninv_addr: .quad 16382 +ninv_tw_addr: .quad 4197891 +intt_dilithium_123_45678_opt_a72: +_intt_dilithium_123_45678_opt_a72: + push_stack + + in .req x0 + inp .req x1 + inpp .req x2 + count .req x3 + r_ptr0 .req x4 + r_ptr1 .req x5 + xtmp .req x6 + + data0 .req v9 + data1 .req v10 + data2 .req v11 + data3 .req v12 + data4 .req v13 + data5 .req v14 + data6 .req v15 + data7 .req v16 + + qform_data0 .req q9 + qform_data1 .req q10 + qform_data2 .req q11 + qform_data3 .req q12 + qform_data4 .req q13 + qform_data5 .req q14 + qform_data6 .req q15 + qform_data7 .req q16 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + 
qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ldr q20, [x5, #16] // ..........*................................................................................................................................. + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // ..............................*............................................................................................................. + ld4 {v21.4S, v22.4S, v23.4S, v24.4S}, [x1] // .......*.................................................................................................................................... + ldr q1, [x5, #96] // ..*......................................................................................................................................... + ldr q4, [x5, #112] // .......................................*.................................................................................................... + // gap // ............................................................................................................................................ + ldr q7, [x5, #48] // ........*................................................................................................................................... 
+ ldr q5, [x5, #64] // .*.......................................................................................................................................... + // gap // ............................................................................................................................................ + ldr q6, [x5, #32] // ....*....................................................................................................................................... + ldr q30, [x5, #144] // .....*...................................................................................................................................... + // gap // ............................................................................................................................................ + sub v10.4S, v23.4S, v24.4S // ...........*................................................................................................................................ + sub v2.4S, v14.4S, v15.4S // ..................................*......................................................................................................... + ldr q25, [x5, #128] // ......*..................................................................................................................................... + add v29.4S, v16.4S, v17.4S // ....................................*....................................................................................................... + add v3.4S, v14.4S, v15.4S // .....................................*...................................................................................................... + ldr q0, [x5, #160] // ......................................*..................................................................................................... + sub v13.4S, v16.4S, v17.4S // ..........................................*................................................................................................. 
+ add v14.4S, v23.4S, v24.4S // ............*............................................................................................................................... + ldr q18, [x5, #176] // ...*........................................................................................................................................ + sqrdmulh v24.4S, v2.4S, v30.4S // ...........................................*................................................................................................ + add v28.4S, v21.4S, v22.4S // .............*.............................................................................................................................. + ldr q15, [x5], #(12*16) // .....................*...................................................................................................................... + sub v9.4S, v3.4S, v29.4S // ............................................*............................................................................................... + ldr q16, [x5, #-112] // .........*.................................................................................................................................. + // gap // ............................................................................................................................................ + sub v22.4S, v21.4S, v22.4S // ..............*............................................................................................................................. + mul v19.4S, v2.4S, v25.4S // ........................................*................................................................................................... + // gap // ............................................................................................................................................ 
+ sub v12.4S, v28.4S, v14.4S // ..................*......................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v2.4S, v13.4S, v18.4S // ...............................................*............................................................................................ + add v18.4S, v28.4S, v14.4S // .................*.......................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v14.4S, v9.4S, v4.4S // .................................................*.......................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v13.4S, v13.4S, v0.4S // ..............................................................*............................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v19.4S, v24.4S, v8.S[0] // ...................................................*........................................................................................ + add v24.4S, v3.4S, v29.4S // ..............................................*............................................................................................. + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v13.4S, v2.4S, v8.S[0] // ................................................................*........................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v31.4S, v9.4S, v1.4S // ......................................................*..................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v31.4S, v14.4S, v8.S[0] // ..........................................................*................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v26.4S, v19.4S, v13.4S // .......................................................................*.................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v3.4S, v22.4S, v7.4S // ...................*........................................................................................................................ + add v11.4S, v19.4S, v13.4S // ...........................................................................*................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v19.4S, v26.4S, v4.4S // ..........................................................................*................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v14.4S, v24.4S, v11.4S // ..............................................................................*............................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v7.4S, v24.4S, v11.4S // ...................................................................................*........................................................ + sqrdmulh v11.4S, v12.4S, v20.4S // ........................*................................................................................................................... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v4.4S, v26.4S, v1.4S // ...............................................................................*............................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v4.4S, v19.4S, v8.S[0] // ................................................................................*........................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v29.4S, v10.4S, v5.4S // ...............*............................................................................................................................ + ldr q0, [x4, #32] // .........................................*.................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v26.4S, v12.4S, v15.4S // ..........................*................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v30.4S, v10.4S, v16.4S // ................*........................................................................................................................... + trn1 v17.4S, v31.4S, v4.4S // ......................................................................................*..................................................... + // gap // ............................................................................................................................................ + trn2 v23.4S, v31.4S, v4.4S // ........................................................................................*................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v31.4S, v22.4S, v6.4S // ....................*....................................................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ trn2 v12.2D, v14.2D, v17.2D // ..........................................................................................*................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v31.4S, v3.4S, v8.S[0] // .......................*.................................................................................................................... + trn2 v3.2D, v7.2D, v23.2D // ............................................................................................*............................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v29.4S, v30.4S, v8.S[0] // ......................*..................................................................................................................... + ldr q30, [x4, #48] // *........................................................................................................................................... + // gap // ............................................................................................................................................ 
+ sub v25.4S, v12.4S, v3.4S // .................................................................................................*.......................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v26.4S, v11.4S, v8.S[0] // ...............................*............................................................................................................ + trn1 v11.2D, v7.2D, v23.2D // ...............................................................................................*............................................ + // gap // ............................................................................................................................................ + trn1 v23.2D, v14.2D, v17.2D // .............................................................................................*.............................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v5.4S, v12.4S, v3.4S // ..................................................................................................*......................................... + sqrdmulh v4.4S, v25.4S, v30.S[1] // ..........................................................................................................*................................. + // gap // ............................................................................................................................................ 
+ sub v3.4S, v31.4S, v29.4S // .........................*.................................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v22.4S, v25.4S, v30.S[0] // ......................................................................................................*..................................... + add v16.4S, v23.4S, v11.4S // .....................................................................................................*...................................... + // gap // ............................................................................................................................................ + add v13.4S, v31.4S, v29.4S // ...........................*................................................................................................................ + ldr q31, [x4, #16] // .....................................................*...................................................................................... + // gap // ............................................................................................................................................ + sub v24.4S, v23.4S, v11.4S // ....................................................................................................*....................................... + mul v2.4S, v3.4S, v15.4S // .................................*.......................................................................................................... + // gap // ............................................................................................................................................ 
+ sub v1.4S, v16.4S, v5.4S // .........................................................................................................*.................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v16.4S, v16.4S, v5.4S // ........................................................................................................*................................... + sqrdmulh v21.4S, v3.4S, v20.4S // ............................*............................................................................................................... + // gap // ............................................................................................................................................ + trn2 v3.4S, v18.4S, v13.4S // .............................*.............................................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn1 v18.4S, v18.4S, v13.4S // ................................*........................................................................................................... + mls v22.4S, v4.4S, v8.S[0] // ...............................................................................................................*............................ + // gap // ............................................................................................................................................ 
+ srshr v29.4S, v16.4S, #23 // ...........................................................................................................*................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v25.4S, v24.4S, v0.S[3] // .......................................................................................................*.................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v2.4S, v21.4S, v8.S[0] // ...................................*........................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v14.4S, v24.4S, v0.S[2] // ............................................................................................................*............................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v14.4S, v25.4S, v8.S[0] // .............................................................................................................*.............................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ trn1 v23.4S, v26.4S, v2.4S // .............................................*.............................................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v30.4S, v26.4S, v2.4S // ................................................*........................................................................................... + sqrdmulh v12.4S, v1.4S, v31.S[1] // ..............................................................................................................*............................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v9.4S, v1.4S, v31.S[0] // ................................................................................................................*........................... + trn1 v11.2D, v18.2D, v23.2D // ..................................................*......................................................................................... + // gap // ............................................................................................................................................ 
+ sub v4.4S, v14.4S, v22.4S // ..................................................................................................................*......................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v16.4S, v29.4S, v8.4S // .................................................................................................................*.......................... + add v22.4S, v14.4S, v22.4S // ....................................................................................................................*....................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v6.4S, v4.4S, v31.S[0] // .....................................................................................................................*...................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ srshr v2.4S, v22.4S, #23 // .........................................................................................................................*.................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v9.4S, v12.4S, v8.S[0] // ...................................................................................................................*........................ + trn1 v29.2D, v3.2D, v30.2D // ....................................................*....................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v12.4S, v4.4S, v31.S[1] // .......................................................................................................................*.................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sub v19.4S, v11.4S, v29.4S // ........................................................*................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + trn2 v20.2D, v18.2D, v23.2D // .........................................................*.................................................................................. + mls v22.4S, v2.4S, v8.4S // ..............................................................................................................................*............. + // gap // ............................................................................................................................................ + trn2 v23.2D, v3.2D, v30.2D // .......................................................*.................................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v10.4S, v19.4S, v31.S[2] // .....................................................................*...................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ add v2.4S, v11.4S, v29.4S // ...........................................................*................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v19.4S, v19.4S, v31.S[3] // ............................................................*............................................................................... + add v27.4S, v20.4S, v23.4S // .............................................................*.............................................................................. + // gap // ............................................................................................................................................ + sub v15.4S, v20.4S, v23.4S // ...............................................................*............................................................................ + ldr q20, [x4], #64 // ..................................................................*......................................................................... + // gap // ............................................................................................................................................ + mls v6.4S, v12.4S, v8.S[0] // ...............................................................................................................................*............ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ sub v28.4S, v2.4S, v27.4S // .................................................................*.......................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v26.4S, v2.4S, v27.4S // ...................................................................*........................................................................ + sqrdmulh v2.4S, v15.4S, v0.S[1] // ....................................................................*....................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v0.4S, v15.4S, v0.S[0] // ........................................................................*................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ srshr v13.4S, v26.4S, #23 // ......................................................................*..................................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v23.4S, v28.4S, v20.S[3] // ............................................................................*............................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v0.4S, v2.4S, v8.S[0] // .........................................................................*.................................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v10.4S, v19.4S, v8.S[0] // .............................................................................*.............................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v26.4S, v13.4S, v8.4S // ..................................................................................*......................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v11.4S, v28.4S, v20.S[2] // .....................................................................................*...................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v27.4S, v10.4S, v0.4S // .................................................................................*.......................................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v11.4S, v23.4S, v8.S[0] // .......................................................................................*.................................................... + add v14.4S, v10.4S, v0.4S // ....................................................................................*....................................................... + // gap // ............................................................................................................................................ 
+ sub v2.4S, v26.4S, v16.4S // ......................................................................................................................*..................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v0.4S, v26.4S, v16.4S // ........................................................................................................................*................... + sqrdmulh v4.4S, v27.4S, v20.S[3] // ...........................................................................................*................................................ + // gap // ............................................................................................................................................ + srshr v23.4S, v14.4S, #23 // .........................................................................................*.................................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mul v7.4S, v27.4S, v20.S[2] // ..............................................................................................*............................................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ str q0, [x1], #(16*4) // ............................................................................................................................*............... + sub v19.4S, v11.4S, v9.4S // ...........................................................................................................................*................ + // gap // ............................................................................................................................................ + add v9.4S, v11.4S, v9.4S // .............................................................................................................................*.............. + mls v14.4S, v23.4S, v8.4S // ................................................................................................*........................................... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v16.4S, v2.4S, v20.S[1] // ..........................................................................................................................*................. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v7.4S, v4.4S, v8.S[0] // ...................................................................................................*........................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sub v15.4S, v14.4S, v22.4S // .................................................................................................................................*.......... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + add v30.4S, v14.4S, v22.4S // ..................................................................................................................................*......... + mul v2.4S, v2.4S, v20.S[0] // ................................................................................................................................*........... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + mls v2.4S, v16.4S, v8.S[0] // ...................................................................................................................................*........ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q30, [x1, #-48] // .......................................................................................................................................*.... + sub v31.4S, v7.4S, v6.4S // ....................................................................................................................................*....... + // gap // ............................................................................................................................................ + add v4.4S, v7.4S, v6.4S // .....................................................................................................................................*...... + mul v23.4S, v19.4S, v20.S[0] // ......................................................................................................................................*..... + // gap // ............................................................................................................................................ 
+ // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + sqrdmulh v11.4S, v15.4S, v20.S[1] // ........................................................................................................................................*... + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q4, [x1, #-16] // ..........................................................................................................................................*. + // gap // ............................................................................................................................................ + // gap // ............................................................................................................................................ + str q2, [x2], #(16*4) // .........................................................................................................................................*.. + sqrdmulh v4.4S, v31.4S, v20.S[1] // ...........................................................................................................................................* + // gap // ............................................................................................................................................ 
+ + // original source code + // ldr q21, [x4, #48] // .......................................................*.................................................................................... + // ldr q24, [x5, #64] // ......*..................................................................................................................................... + // ldr q3, [x5, #96] // ...*........................................................................................................................................ + // ldr q10, [x5, #176] // .................*.......................................................................................................................... + // ldr q7, [x5, #32] // .......*.................................................................................................................................... + // ldr q12, [x5, #144] // ........*................................................................................................................................... + // ldr q22, [x5, #128] // ...........*................................................................................................................................ + // ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // ..*......................................................................................................................................... + // ldr q29, [x5, #48] // .....*...................................................................................................................................... + // ldr q20, [x5, #80] // ......................*..................................................................................................................... + // ldr q18, [x5, #16] // *........................................................................................................................................... 
+ // sub v9.4S, v27.4S, v28.4S // .........*.................................................................................................................................. + // add v27.4S, v27.4S, v28.4S // ................*........................................................................................................................... + // add v19.4S, v25.4S, v26.4S // ...................*........................................................................................................................ + // sub v26.4S, v25.4S, v26.4S // .......................*.................................................................................................................... + // mul v6.4S, v9.4S, v24.4S // ............................................*............................................................................................... + // sqrdmulh v5.4S, v9.4S, v20.4S // ...............................................*............................................................................................ + // add v31.4S, v19.4S, v27.4S // ...........................*................................................................................................................ + // sub v25.4S, v19.4S, v27.4S // .........................*.................................................................................................................. + // sqrdmulh v15.4S, v26.4S, v29.4S // ....................................*....................................................................................................... + // mul v2.4S, v26.4S, v7.4S // ..................................................*......................................................................................... + // ldr q14, [x5], #(12*16) // ....................*....................................................................................................................... 
+ // mls v6.4S, v5.4S, v8.S[0] // ......................................................*..................................................................................... + // mls v2.4S, v15.4S, v8.S[0] // ....................................................*....................................................................................... + // sqrdmulh v15.4S, v25.4S, v18.4S // .........................................*.................................................................................................. + // sub v16.4S, v2.4S, v6.4S // ..............................................................*............................................................................. + // mul v13.4S, v25.4S, v14.4S // ..............................................*............................................................................................. + // add v28.4S, v2.4S, v6.4S // .................................................................*.......................................................................... + // sqrdmulh v11.4S, v16.4S, v18.4S // .......................................................................*.................................................................... + // trn2 v23.4S, v31.4S, v28.4S // ........................................................................*................................................................... + // ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x2] // .*.......................................................................................................................................... + // mls v13.4S, v15.4S, v8.S[0] // .........................................................*.................................................................................. + // trn1 v17.4S, v31.4S, v28.4S // .........................................................................*.................................................................. 
+ // mul v0.4S, v16.4S, v14.4S // ....................................................................*....................................................................... + // sub v26.4S, v4.4S, v5.4S // ..........*................................................................................................................................. + // mls v0.4S, v11.4S, v8.S[0] // .............................................................................*.............................................................. + // add v31.4S, v6.4S, v7.4S // ............*............................................................................................................................... + // add v24.4S, v4.4S, v5.4S // .............*.............................................................................................................................. + // ldr q11, [x5, #-32] // ..............*............................................................................................................................. + // ldr q18, [x5, #-80] // ....*....................................................................................................................................... + // mul v9.4S, v26.4S, v22.4S // ........................*................................................................................................................... + // ldr q5, [x4, #32] // .............................................*.............................................................................................. + // sub v16.4S, v6.4S, v7.4S // ...............*............................................................................................................................ + // sqrdmulh v30.4S, v26.4S, v12.4S // ..................*......................................................................................................................... 
+ // sub v15.4S, v24.4S, v31.4S // .....................*...................................................................................................................... + // trn1 v26.4S, v13.4S, v0.4S // ................................................................................*........................................................... + // add v24.4S, v24.4S, v31.4S // ...............................*............................................................................................................ + // sqrdmulh v27.4S, v16.4S, v10.4S // ..........................*................................................................................................................. + // trn2 v6.4S, v13.4S, v0.4S // .................................................................................*.......................................................... + // sqrdmulh v25.4S, v15.4S, v18.4S // ............................*............................................................................................................... + // trn1 v19.2D, v17.2D, v26.2D // ....................................................................................*....................................................... + // mls v9.4S, v30.4S, v8.S[0] // ..............................*............................................................................................................. + // trn1 v28.2D, v23.2D, v6.2D // ...........................................................................................*................................................ + // ldr q7, [x4, #16] // ..................................................................*......................................................................... + // mul v0.4S, v15.4S, v3.4S // .................................*.......................................................................................................... 
+ // trn2 v15.2D, v23.2D, v6.2D // ................................................................................................*........................................... + // sub v4.4S, v19.4S, v28.4S // .............................................................................................*.............................................. + // trn2 v23.2D, v17.2D, v26.2D // ..............................................................................................*............................................. + // mls v0.4S, v25.4S, v8.S[0] // ..................................*......................................................................................................... + // add v13.4S, v19.4S, v28.4S // ..................................................................................................*......................................... + // sqrdmulh v26.4S, v4.4S, v7.S[3] // ...................................................................................................*........................................ + // add v1.4S, v23.4S, v15.4S // ....................................................................................................*....................................... + // mul v31.4S, v16.4S, v11.4S // .............................*.............................................................................................................. + // sub v29.4S, v23.4S, v15.4S // .....................................................................................................*...................................... + // mls v31.4S, v27.4S, v8.S[0] // ................................*........................................................................................................... + // sub v27.4S, v13.4S, v1.4S // ........................................................................................................*................................... 
+ // ldr q20, [x4], #64 // ......................................................................................................*..................................... + // add v25.4S, v13.4S, v1.4S // .........................................................................................................*.................................. + // sqrdmulh v6.4S, v29.4S, v5.S[1] // ..........................................................................................................*................................. + // mul v28.4S, v4.4S, v7.S[2] // .................................................................................................*.......................................... + // srshr v15.4S, v25.4S, #23 // ............................................................................................................*............................... + // sub v17.4S, v9.4S, v31.4S // ...................................*........................................................................................................ + // mul v13.4S, v29.4S, v5.S[0] // ...........................................................................................................*................................ + // mls v13.4S, v6.4S, v8.S[0] // ..............................................................................................................*............................. + // sqrdmulh v16.4S, v17.4S, v18.4S // ......................................*..................................................................................................... + // add v10.4S, v9.4S, v31.4S // .....................................*...................................................................................................... + // sqrdmulh v23.4S, v27.4S, v20.S[3] // .............................................................................................................*.............................. 
+ // mls v28.4S, v26.4S, v8.S[0] // ...............................................................................................................*............................ + // trn1 v30.4S, v24.4S, v10.4S // .......................................*.................................................................................................... + // mul v12.4S, v17.4S, v3.4S // ..........................................*................................................................................................. + // mls v12.4S, v16.4S, v8.S[0] // ...........................................*................................................................................................ + // sub v22.4S, v28.4S, v13.4S // ..................................................................................................................*......................... + // mls v25.4S, v15.4S, v8.4S // ................................................................................................................*........................... + // trn2 v15.4S, v24.4S, v10.4S // ........................................*................................................................................................... + // add v31.4S, v28.4S, v13.4S // ....................................................................................................................*....................... + // mul v1.4S, v27.4S, v20.S[2] // .................................................................................................................*.......................... + // trn1 v2.4S, v0.4S, v12.4S // ................................................*........................................................................................... + // mls v1.4S, v23.4S, v8.S[0] // ...................................................................................................................*........................ 
+ // trn2 v10.4S, v0.4S, v12.4S // .................................................*.......................................................................................... + // srshr v28.4S, v31.4S, #23 // ........................................................................................................................*................... + // trn2 v11.2D, v30.2D, v2.2D // ...................................................*........................................................................................ + // sqrdmulh v19.4S, v22.4S, v20.S[3] // .......................................................................................................................*.................... + // trn2 v12.2D, v15.2D, v10.2D // .....................................................*...................................................................................... + // trn1 v14.2D, v30.2D, v2.2D // ...........................................................*................................................................................ + // mul v6.4S, v22.4S, v20.S[2] // .........................................................................................................................*.................. + // trn1 v2.2D, v15.2D, v10.2D // ..........................................................*................................................................................. + // mls v31.4S, v28.4S, v8.4S // .............................................................................................................................*.............. + // sub v13.4S, v11.4S, v12.4S // ........................................................*................................................................................... + // add v23.4S, v11.4S, v12.4S // ............................................................*............................................................................... 
+ // mls v6.4S, v19.4S, v8.S[0] // ...............................................................................................................................*............ + // sub v17.4S, v14.4S, v2.4S // ...................................................................*........................................................................ + // add v26.4S, v14.4S, v2.4S // ................................................................*........................................................................... + // mul v24.4S, v13.4S, v21.S[0] // ...............................................................*............................................................................ + // sqrdmulh v19.4S, v17.4S, v5.S[3] // ............................................................................*............................................................... + // add v16.4S, v26.4S, v23.4S // ......................................................................*..................................................................... + // sub v18.4S, v26.4S, v23.4S // .....................................................................*...................................................................... + // sqrdmulh v23.4S, v13.4S, v21.S[1] // .............................................................*.............................................................................. + // srshr v27.4S, v16.4S, #23 // ...........................................................................*................................................................ + // mul v17.4S, v17.4S, v5.S[2] // ..............................................................................*............................................................. + // mls v17.4S, v19.4S, v8.S[0] // ...............................................................................*............................................................ 
+ // sqrdmulh v2.4S, v18.4S, v7.S[1] // ..................................................................................*......................................................... + // mls v24.4S, v23.4S, v8.S[0] // ..........................................................................*................................................................. + // mul v10.4S, v18.4S, v7.S[0] // ...................................................................................*........................................................ + // mls v16.4S, v27.4S, v8.4S // ......................................................................................*..................................................... + // sub v19.4S, v17.4S, v24.4S // .....................................................................................*...................................................... + // mls v10.4S, v2.4S, v8.S[0] // ..........................................................................................*................................................. + // add v2.4S, v17.4S, v24.4S // .......................................................................................*.................................................... + // mul v14.4S, v19.4S, v7.S[0] // ........................................................................................*................................................... + // sub v12.4S, v25.4S, v16.4S // .....................................................................................................................*...................... + // sqrdmulh v24.4S, v19.4S, v7.S[1] // ............................................................................................*............................................... + // add v9.4S, v25.4S, v16.4S // ......................................................................................................................*..................... 
+ // srshr v16.4S, v2.4S, #23 // .........................................................................................*.................................................. + // sqrdmulh v0.4S, v12.4S, v20.S[1] // ..............................................................................................................................*............. + // sub v19.4S, v1.4S, v10.4S // ...........................................................................................................................*................ + // str q9, [x1], #(16*4) // ..........................................................................................................................*................. + // add v9.4S, v1.4S, v10.4S // ............................................................................................................................*............... + // mls v2.4S, v16.4S, v8.4S // ...............................................................................................*............................................ + // mls v14.4S, v24.4S, v8.S[0] // .......................................................................................................*.................................... + // mul v13.4S, v12.4S, v20.S[0] // ..................................................................................................................................*......... + // sub v15.4S, v31.4S, v2.4S // ................................................................................................................................*........... + // add v2.4S, v31.4S, v2.4S // .................................................................................................................................*.......... + // mls v13.4S, v0.4S, v8.S[0] // ...................................................................................................................................*........ 
+ // sub v31.4S, v6.4S, v14.4S // .....................................................................................................................................*...... + // add v27.4S, v6.4S, v14.4S // ......................................................................................................................................*..... + // mul v23.4S, v19.4S, v20.S[0] // .......................................................................................................................................*.... + // str q2, [x1, #-48] // ....................................................................................................................................*....... + // sqrdmulh v11.4S, v15.4S, v20.S[1] // ........................................................................................................................................*... + // str q13, [x2], #(16*4) // ..........................................................................................................................................*. + // str q27, [x1, #-16] // .........................................................................................................................................*.. + // sqrdmulh v4.4S, v31.4S, v20.S[1] // ...........................................................................................................................................* + + sub count, count, #1 +layer45678_start: + ldr q21, [x4, #48] // .........................................................................e.............................................................................. + ldr q24, [x5, #64] // ......e................................................................................................................................................. + // gap // ........................................................................................................................................................ 
+ ldr q3, [x5, #96] // ............................e........................................................................................................................... + ldr q10, [x5, #176] // .................................e...................................................................................................................... + mul v30.4S, v31.4S, v20.S[0] // ...........................................................................................................................................*............ + ldr q7, [x5, #32] // ....e................................................................................................................................................... + str q9, [x1, #-32] // ................................................................................................................................................*....... + add x1, x1, #64 // ......................................................................................................................................................*. + ldr q12, [x5, #144] // ...............................e........................................................................................................................ + sqrdmulh v0.4S, v19.4S, v20.S[1] // .......................................................................................................................................*................ + // gap // ........................................................................................................................................................ + ldr q22, [x5, #128] // ..............................e......................................................................................................................... + ld4 {v25.4S, v26.4S, v27.4S, v28.4S}, [x1] // e....................................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + ldr q29, [x5, #48] // .....e.................................................................................................................................................. + mul v1.4S, v15.4S, v20.S[0] // .................................................................................................................................*...................... + ldr q20, [x5, #80] // .......e................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q18, [x5, #16] // ...e.................................................................................................................................................... + mls v30.4S, v4.4S, v8.S[0] // .............................................................................................................................................*.......... + // gap // ........................................................................................................................................................ + sub v9.4S, v27.4S, v28.4S // .............e.......................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v27.4S, v27.4S, v28.4S // ..............e......................................................................................................................................... + mls v1.4S, v11.4S, v8.S[0] // ...................................................................................................................................*.................... + // gap // ........................................................................................................................................................ + add v19.4S, v25.4S, v26.4S // .........e.............................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v26.4S, v25.4S, v26.4S // ........e............................................................................................................................................... + mul v6.4S, v9.4S, v24.4S // ...............e........................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ str q30, [x2, #-16] // .....................................................................................................................................................*.. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v5.4S, v9.4S, v20.4S // ................e....................................................................................................................................... + add v31.4S, v19.4S, v27.4S // ...................e.................................................................................................................................... + // gap // ........................................................................................................................................................ + str q1, [x2, #-48] // ...................................................................................................................................................*.... + sub v25.4S, v19.4S, v27.4S // ..................e..................................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v15.4S, v26.4S, v29.4S // ...........e............................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v2.4S, v26.4S, v7.4S // ..........e............................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q14, [x5], #(12*16) // ..e..................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v6.4S, v5.4S, v8.S[0] // .................e...................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v2.4S, v15.4S, v8.S[0] // ............e........................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v23.4S, v0.4S, v8.S[0] // ........................................................................................................................................*............... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v15.4S, v25.4S, v18.4S // .....................e.................................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v16.4S, v2.4S, v6.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v13.4S, v25.4S, v14.4S // ....................e................................................................................................................................... + add v28.4S, v2.4S, v6.4S // ........................e............................................................................................................................... + // gap // ........................................................................................................................................................ + str q23, [x2, #-32] // ....................................................................................................................................................*... + add x2, x2, #64 // .......................................................................................................................................................* + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v16.4S, v18.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v23.4S, v31.4S, v28.4S // .......................................................e................................................................................................ 
+ ld4 {v4.4S, v5.4S, v6.4S, v7.4S}, [x2] // .e...................................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v13.4S, v15.4S, v8.S[0] // ......................e................................................................................................................................. + trn1 v17.4S, v31.4S, v28.4S // ......................................................e................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v0.4S, v16.4S, v14.4S // .........................e.............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v26.4S, v4.4S, v5.4S // ..................................e..................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v0.4S, v11.4S, v8.S[0] // ...........................e............................................................................................................................ + add v31.4S, v6.4S, v7.4S // ........................................e............................................................................................................... + // gap // ........................................................................................................................................................ + add v24.4S, v4.4S, v5.4S // ...................................e.................................................................................................................... + ldr q11, [x5, #-32] // ................................e....................................................................................................................... + ldr q18, [x5, #-80] // .............................e.......................................................................................................................... + mul v9.4S, v26.4S, v22.4S // ....................................e................................................................................................................... + ldr q5, [x4, #32] // ........................................................................e............................................................................... 
+ // gap // ........................................................................................................................................................ + sub v16.4S, v6.4S, v7.4S // .......................................e................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v26.4S, v12.4S // .....................................e.................................................................................................................. + sub v15.4S, v24.4S, v31.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + trn1 v26.4S, v13.4S, v0.4S // ........................................................e............................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v24.4S, v24.4S, v31.4S // .............................................e.......................................................................................................... 
+ sqrdmulh v27.4S, v16.4S, v10.4S // ..........................................e............................................................................................................. + // gap // ........................................................................................................................................................ + trn2 v6.4S, v13.4S, v0.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v25.4S, v15.4S, v18.4S // ...............................................e........................................................................................................ + trn1 v19.2D, v17.2D, v26.2D // ............................................................e........................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mls v9.4S, v30.4S, v8.S[0] // ......................................e................................................................................................................. + trn1 v28.2D, v23.2D, v6.2D // .............................................................e.......................................................................................... + ldr q7, [x4, #16] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v0.4S, v15.4S, v3.4S // ..............................................e......................................................................................................... + trn2 v15.2D, v23.2D, v6.2D // ...........................................................e............................................................................................ + // gap // ........................................................................................................................................................ + sub v4.4S, v19.4S, v28.4S // ..........................................................................e............................................................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + trn2 v23.2D, v17.2D, v26.2D // ..........................................................e............................................................................................. + mls v0.4S, v25.4S, v8.S[0] // ................................................e....................................................................................................... + // gap // ........................................................................................................................................................ + add v13.4S, v19.4S, v28.4S // ...........................................................................e............................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v26.4S, v4.4S, v7.S[3] // .............................................................................e.......................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v1.4S, v23.4S, v15.4S // ................................................................................e....................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v31.4S, v16.4S, v11.4S // .........................................e.............................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v29.4S, v23.4S, v15.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v31.4S, v27.4S, v8.S[0] // ...........................................e............................................................................................................ + sub v27.4S, v13.4S, v1.4S // ..............................................................................................e......................................................... + ldr q20, [x4], #64 // ......................................................................e................................................................................. 
+ add v25.4S, v13.4S, v1.4S // ...............................................................................................e........................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v6.4S, v29.4S, v5.S[1] // ..................................................................................e..................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v28.4S, v4.4S, v7.S[2] // ............................................................................e........................................................................... + srshr v15.4S, v25.4S, #23 // ..................................................................................................................e..................................... 
+ // gap // ........................................................................................................................................................ + sub v17.4S, v9.4S, v31.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v13.4S, v29.4S, v5.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v13.4S, v6.4S, v8.S[0] // ...................................................................................e.................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v16.4S, v17.4S, v18.4S // ....................................................e................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ add v10.4S, v9.4S, v31.4S // ..................................................e..................................................................................................... + sqrdmulh v23.4S, v27.4S, v20.S[3] // .................................................................................................e...................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v28.4S, v26.4S, v8.S[0] // ..............................................................................e......................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v30.4S, v24.4S, v10.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v12.4S, v17.4S, v3.4S // ...................................................e.................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v12.4S, v16.4S, v8.S[0] // .....................................................e.................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v22.4S, v28.4S, v13.4S // ...................................................................................................e.................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v25.4S, v15.4S, v8.4S // ...................................................................................................................e.................................... + trn2 v15.4S, v24.4S, v10.4S // ...............................................................e........................................................................................ + // gap // ........................................................................................................................................................ + add v31.4S, v28.4S, v13.4S // ....................................................................................................e................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v1.4S, v27.4S, v20.S[2] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ trn1 v2.4S, v0.4S, v12.4S // ................................................................e....................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v1.4S, v23.4S, v8.S[0] // ..................................................................................................e..................................................... + trn2 v10.4S, v0.4S, v12.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ + srshr v28.4S, v31.4S, #23 // ....................................................................................................................e................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v11.2D, v30.2D, v2.2D // ..................................................................e..................................................................................... + sqrdmulh v19.4S, v22.4S, v20.S[3] // ......................................................................................................e................................................. 
+ // gap // ........................................................................................................................................................ + trn2 v12.2D, v15.2D, v10.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v14.2D, v30.2D, v2.2D // ....................................................................e................................................................................... + mul v6.4S, v22.4S, v20.S[2] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ + trn1 v2.2D, v15.2D, v10.2D // .....................................................................e.................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v31.4S, v28.4S, v8.4S // .....................................................................................................................e.................................. 
+ sub v13.4S, v11.4S, v12.4S // .........................................................................................e.............................................................. + // gap // ........................................................................................................................................................ + add v23.4S, v11.4S, v12.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v6.4S, v19.4S, v8.S[0] // .......................................................................................................e................................................ + sub v17.4S, v14.4S, v2.4S // ....................................................................................e................................................................... + // gap // ........................................................................................................................................................ + add v26.4S, v14.4S, v2.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v24.4S, v13.4S, v21.S[0] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v19.4S, v17.4S, v5.S[3] // .......................................................................................e................................................................ + add v16.4S, v26.4S, v23.4S // .........................................................................................................e.............................................. + // gap // ........................................................................................................................................................ + sub v18.4S, v26.4S, v23.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sqrdmulh v23.4S, v13.4S, v21.S[1] // ............................................................................................e........................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v27.4S, v16.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v17.4S, v17.4S, v5.S[2] // ......................................................................................e................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v17.4S, v19.4S, v8.S[0] // ........................................................................................e............................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v2.4S, v18.4S, v7.S[1] // ...........................................................................................................e............................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v24.4S, v23.4S, v8.S[0] // .............................................................................................e.......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v10.4S, v18.4S, v7.S[0] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v16.4S, v27.4S, v8.4S // .......................................................................................................................e................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v19.4S, v17.4S, v24.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v10.4S, v2.4S, v8.S[0] // ............................................................................................................e........................................... 
+ add v2.4S, v17.4S, v24.4S // ..............................................................................................................e......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v14.4S, v19.4S, v7.S[0] // ...............................................................................................................e........................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v12.4S, v25.4S, v16.4S // ..........................................................................................................................e............................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sqrdmulh v24.4S, v19.4S, v7.S[1] // ................................................................................................................e....................................... + add v9.4S, v25.4S, v16.4S // ...........................................................................................................................e............................ + // gap // ........................................................................................................................................................ + srshr v16.4S, v2.4S, #23 // ........................................................................................................................e............................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v0.4S, v12.4S, v20.S[1] // .............................................................................................................................e.......................... + sub v19.4S, v1.4S, v10.4S // ....................................................................................................................................e................... + // gap // ........................................................................................................................................................ + str q9, [x1], #(16*4) // ..............................................................................................................................................e......... + add v9.4S, v1.4S, v10.4S // .....................................................................................................................................e.................. 
+ // gap // ........................................................................................................................................................ + mls v2.4S, v16.4S, v8.4S // .........................................................................................................................e.............................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v14.4S, v24.4S, v8.S[0] // .................................................................................................................e...................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v13.4S, v12.4S, v20.S[0] // ............................................................................................................................e........................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v15.4S, v31.4S, v2.4S // ...............................................................................................................................e........................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v2.4S, v31.4S, v2.4S // ................................................................................................................................e....................... + mls v13.4S, v0.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ 
+ sub v31.4S, v6.4S, v14.4S // .........................................................................................................................................e.............. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v27.4S, v6.4S, v14.4S // ..........................................................................................................................................e............. + mul v23.4S, v19.4S, v20.S[0] // ......................................................................................................................................e................. + // gap // ........................................................................................................................................................ + str q2, [x1, #-48] // ...............................................................................................................................................e........ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v11.4S, v15.4S, v20.S[1] // ..................................................................................................................................e..................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + str q13, [x2], #(16*4) // ..................................................................................................................................................e..... + str q27, [x1, #-16] // .................................................................................................................................................e...... + // gap // ........................................................................................................................................................ + sqrdmulh v4.4S, v31.4S, v20.S[1] // ............................................................................................................................................e........... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ...........e............................................................................................................................................|..........e............................ + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ..........................................e.............................................................................................................|....................................... + // ldr q0, [x5], #(12*16) // ..............................e.........................................................................................................................|.............................e......... 
+ // ldr q4, [x5, #(-12*16 + 1*16)] // ...............e........................................................................................................................................|..............e........................ + // ldr q1, [x5, #(-12*16 + 2*16)] // .....e..................................................................................................................................................|....e.................................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ............e...........................................................................................................................................|...........e........................... + // ldr q2, [x5, #(-12*16 + 4*16)] // .e......................................................................................................................................................|e...................................... + // ldr q6, [x5, #(-12*16 + 5*16)] // ..............e.........................................................................................................................................|.............e......................... + // sub v24.4s, v9.4s, v10.4s // .....................e..................................................................................................................................|....................e.................. + // add v9.4s, v9.4s, v10.4s // ....................e...................................................................................................................................|...................e................... + // mul v10.4s, v24.4s, v1.4s // .............................e..........................................................................................................................|............................e.......... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // ............................e...........................................................................................................................|...........................e........... + // mls v10.4s, v24.4s, v8.s[0] // ................................e.......................................................................................................................|...............................e....... + // sub v24.4s, v11.4s, v12.4s // .................e......................................................................................................................................|................e...................... + // add v11.4s, v11.4s, v12.4s // ..................e.....................................................................................................................................|.................e..................... + // mul v12.4s, v24.4s, v2.4s // ......................e.................................................................................................................................|.....................e................. + // sqrdmulh v24.4s, v24.4s, v6.4s // ........................e...............................................................................................................................|.......................e............... + // mls v12.4s, v24.4s, v8.s[0] // ...............................e........................................................................................................................|..............................e........ + // sub v24.4s, v9.4s, v11.4s // ...........................e............................................................................................................................|..........................e............ 
+ // add v9.4s, v9.4s, v11.4s // .........................e..............................................................................................................................|........................e.............. + // mul v11.4s, v24.4s, v0.4s // ....................................e...................................................................................................................|...................................e... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................e.....................................................................................................................|.................................e..... + // mls v11.4s, v24.4s, v8.s[0] // ...........................................e............................................................................................................|....................................... + // sub v24.4s, v10.4s, v12.4s // ...................................e....................................................................................................................|..................................e.... + // add v10.4s, v10.4s, v12.4s // .....................................e..................................................................................................................|....................................e.. + // mul v12.4s, v24.4s, v0.4s // .............................................e..........................................................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ........................................e...............................................................................................................|....................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // ...............................................e........................................................................................................|....................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ..e.....................................................................................................................................................|.e..................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ...................................................e....................................................................................................|....................................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ..........e.............................................................................................................................................|.........e............................. + // ldr q5, [x5, #(-12*16 + 9*16)] // ........e...............................................................................................................................................|.......e............................... + // ldr q2, [x5, #(-12*16 + 10*16)] // ..................................................e.....................................................................................................|....................................... + // ldr q6, [x5, #(-12*16 + 11*16)] // ...e....................................................................................................................................................|..e.................................... + // sub v24.4s, v13.4s, v14.4s // ..............................................e.........................................................................................................|....................................... 
+ // add v13.4s, v13.4s, v14.4s // .................................................e......................................................................................................|....................................... + // mul v14.4s, v24.4s, v1.4s // ....................................................e...................................................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .......................................................e................................................................................................|....................................... + // mls v14.4s, v24.4s, v8.s[0] // ...............................................................e........................................................................................|....................................... + // sub v24.4s, v15.4s, v16.4s // ......................................................e.................................................................................................|....................................... + // add v15.4s, v15.4s, v16.4s // ................................................e.......................................................................................................|....................................... + // mul v16.4s, v24.4s, v2.4s // ..........................................................................e.............................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................................e............................................................................................|....................................... 
+ // mls v16.4s, v24.4s, v8.s[0] // ............................................................................e...........................................................................|....................................... + // sub v24.4s, v13.4s, v15.4s // ........................................................e...............................................................................................|....................................... + // add v13.4s, v13.4s, v15.4s // ..........................................................e.............................................................................................|....................................... + // mul v15.4s, v24.4s, v0.4s // ..................................................................e.....................................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .............................................................e..........................................................................................|....................................... + // mls v15.4s, v24.4s, v8.s[0] // ......................................................................e.................................................................................|....................................... + // sub v24.4s, v14.4s, v16.4s // ...................................................................................e....................................................................|....................................... + // add v14.4s, v14.4s, v16.4s // .......................................................................................e................................................................|....................................... 
+ // mul v16.4s, v24.4s, v0.4s // ...........................................................................................e............................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ......................................................................................e.................................................................|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ............................................................................................e...........................................................|....................................... + // trn1 v25.4s, v9.4s, v10.4s // ............................................e...........................................................................................................|....................................... + // trn2 v26.4s, v9.4s, v10.4s // .........................................e..............................................................................................................|....................................... + // trn1 v27.4s, v11.4s, v12.4s // .........................................................e..............................................................................................|....................................... + // trn2 v28.4s, v11.4s, v12.4s // ............................................................e...........................................................................................|....................................... + // trn2 v11.2d, v25.2d, v27.2d // .....................................................................e..................................................................................|....................................... 
+ // trn2 v12.2d, v26.2d, v28.2d // ...................................................................e....................................................................................|....................................... + // trn1 v9.2d, v25.2d, v27.2d // ..............................................................e.........................................................................................|....................................... + // trn1 v10.2d, v26.2d, v28.2d // ................................................................e.......................................................................................|....................................... + // trn1 v25.4s, v13.4s, v14.4s // ..........................................................................................e.............................................................|....................................... + // trn2 v26.4s, v13.4s, v14.4s // ...............................................................................................e........................................................|....................................... + // trn1 v27.4s, v15.4s, v16.4s // ..................................................................................................e.....................................................|....................................... + // trn2 v28.4s, v15.4s, v16.4s // ....................................................................................................e...................................................|....................................... + // trn2 v15.2d, v25.2d, v27.2d // ......................................................................................................e.................................................|....................................... 
+ // trn2 v16.2d, v26.2d, v28.2d // ........................................................................................................e...............................................|....................................... + // trn1 v13.2d, v25.2d, v27.2d // .........................................................................................................e..............................................|....................................... + // trn1 v14.2d, v26.2d, v28.2d // ...........................................................................................................e............................................|....................................... + // ldr q0, [x4], #64 // ..............................................................................e.........................................................................|....................................... + // ldr q1, [x4, #(-64 + 16)] // .................................................................e......................................................................................|....................................... + // ldr q2, [x4, #(-64 + 32)] // .....................................................e..................................................................................................|....................................... + // ldr q3, [x4, #(-64 + 48)] // e.......................................................................................................................................................e....................................... + // sub v24.4s, v9.4s, v10.4s // ....................................................................e...................................................................................|....................................... 
+ // add v9.4s, v9.4s, v10.4s // .......................................................................e................................................................................|....................................... + // mul v10.4s, v24.4s, v1.s[2] // .................................................................................e......................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ........................................................................e...............................................................................|....................................... + // mls v10.4s, v24.4s, v8.s[0] // .........................................................................................e..............................................................|....................................... + // sub v24.4s, v11.4s, v12.4s // ...........................................................................e............................................................................|....................................... + // add v11.4s, v11.4s, v12.4s // .........................................................................e..............................................................................|....................................... + // mul v12.4s, v24.4s, v2.s[0] // ....................................................................................e...................................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................e.......................................................................|....................................... 
+ // mls v12.4s, v24.4s, v8.s[0] // .....................................................................................e..................................................................|....................................... + // sub v24.4s, v13.4s, v14.4s // ................................................................................................................e.......................................|....................................... + // add v13.4s, v13.4s, v14.4s // .................................................................................................................e......................................|....................................... + // mul v14.4s, v24.4s, v2.s[2] // ........................................................................................................................e...............................|....................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...................................................................................................................e....................................|....................................... + // mls v14.4s, v24.4s, v8.s[0] // .........................................................................................................................e..............................|....................................... + // sub v24.4s, v15.4s, v16.4s // .............................................................................................................e..........................................|....................................... + // add v15.4s, v15.4s, v16.4s // ..............................................................................................................e.........................................|....................................... 
+ // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................................e.....................................|....................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ......................................................................................................................e.................................|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................e............................|....................................... + // sub v24.4s, v9.4s, v11.4s // .............................................................................e..........................................................................|....................................... + // add v9.4s, v9.4s, v11.4s // ...............................................................................e........................................................................|....................................... + // mul v11.4s, v24.4s, v0.s[2] // .................................................................................................e......................................................|....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ........................................................................................e...............................................................|....................................... + // mls v11.4s, v24.4s, v8.s[0] // ...................................................................................................e....................................................|....................................... 
+ // sub v24.4s, v10.4s, v12.4s // .............................................................................................e..........................................................|....................................... + // add v10.4s, v10.4s, v12.4s // ................................................................................................e.......................................................|....................................... + // mul v12.4s, v24.4s, v0.s[2] // ..........................................................................................................e.............................................|....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................e................................................|....................................... + // mls v12.4s, v24.4s, v8.s[0] // ...............................................................................................................e........................................|....................................... + // sub v24.4s, v13.4s, v15.4s // .....................................................................................................................e..................................|....................................... + // add v13.4s, v13.4s, v15.4s // ....................................................................................................................e...................................|....................................... + // mul v15.4s, v24.4s, v1.s[0] // ............................................................................................................................e...........................|....................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..........................................................................................................................e.............................|....................................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e........................|....................................... + // sub v24.4s, v14.4s, v16.4s // ..............................................................................................................................e.........................|....................................... + // add v14.4s, v14.4s, v16.4s // ................................................................................................................................e.......................|....................................... + // mul v16.4s, v24.4s, v1.s[0] // .................................................................................................................................e......................|....................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................................................e....................|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ...........................................................................................................................................e............|....................................... + // srshr v24.4S, v9.4S, #23 // ..................................................................................e.....................................................................|....................................... 
+ // mls v9.4s, v24.4s, v8.4s // ..............................................................................................e.........................................................|....................................... + // srshr v24.4S, v10.4S, #23 // .....................................................................................................e..................................................|....................................... + // mls v10.4s, v24.4s, v8.4s // ............................................................................................................e...........................................|....................................... + // srshr v24.4S, v13.4S, #23 // .......................................................................................................................e................................|....................................... + // mls v13.4s, v24.4s, v8.4s // .............................................................................................................................e..........................|....................................... + // srshr v24.4S, v14.4S, #23 // .....................................................................................................................................e..................|....................................... + // mls v14.4s, v24.4s, v8.4s // ..........................................................................................................................................e.............|....................................... + // sub v24.4s, v9.4s, v13.4s // ..................................................................................................................................e.....................|....................................... 
+ // add v9.4s, v9.4s, v13.4s // ....................................................................................................................................e...................|....................................... + // mul v13.4s, v24.4s, v0.s[0] // ............................................................................................................................................e...........|....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................e.................|....................................... + // mls v13.4s, v24.4s, v8.s[0] // ...............................................................................................................................................e........|....................................... + // sub v24.4s, v10.4s, v14.4s // .............................................................................................................................................e..........|....................................... + // add v10.4s, v10.4s, v14.4s // ..............................................................................................................................................e.........|....................................... + // mul v14.4s, v24.4s, v0.s[0] // .............*..........................................................................................................................................|............*.......................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................................................................................................................................................e...|....................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ...................*....................................................................................................................................|..................*.................... + // sub v24.4s, v11.4s, v15.4s // .......................................................................................................................................e................|....................................... + // add v11.4s, v11.4s, v15.4s // .........................................................................................................................................e..............|....................................... + // mul v15.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.....|....................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .........*..............................................................................................................................................|........*.............................. + // mls v15.4s, v24.4s, v8.s[0] // .................................*......................................................................................................................|................................*...... + // sub v24.4s, v12.4s, v16.4s // ................................................................................................................................................e.......|....................................... + // add v12.4s, v12.4s, v16.4s // .................................................................................................................................................e......|....................................... 
+ // mul v16.4s, v24.4s, v0.s[0] // ....*...................................................................................................................................................|...*................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................................e|....................................... + // mls v16.4s, v24.4s, v8.s[0] // ................*.......................................................................................................................................|...............*....................... + // str q9, [x1], #(16*4) // ........................................................................................................................................e...............|....................................... + // str q10, [x1, #(-16*4 + 1*16)] // ...................................................................................................................................................e....|....................................... + // str q11, [x1, #(-16*4 + 2*16)] // ......*.................................................................................................................................................|.....*................................. + // str q12, [x1, #(-16*4 + 3*16)] // ......................................................................................................................................................e.|....................................... + // str q13, [x2], #(16*4) // .....................................................................................................................................................e..|....................................... 
+ // str q14, [x2, #(-16*4 + 1*16)] // ..........................*.............................................................................................................................|.........................*............. + // str q15, [x2, #(-16*4 + 2*16)] // ......................................*.................................................................................................................|.....................................*. + // str q16, [x2, #(-16*4 + 3*16)] // .......................*................................................................................................................................|......................*................ + // add x1, x1, #64 // .......*................................................................................................................................................|......*................................ + // add x2, x2, #64 // .......................................*................................................................................................................|......................................* + + sub count, count, #1 + cbnz count, layer45678_start + mul v16.4S, v31.4S, v20.S[0] // *........... + str q9, [x1, #-32] // .*.......... + add x1, x1, #64 // ..*......... + // gap // ............ + // gap // ............ + // gap // ............ + sqrdmulh v2.4S, v19.4S, v20.S[1] // ...*........ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + mul v22.4S, v15.4S, v20.S[0] // ....*....... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + mls v16.4S, v4.4S, v8.S[0] // .....*...... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + mls v22.4S, v11.4S, v8.S[0] // ......*..... + // gap // ............ + // gap // ............ 
+ // gap // ............ + // gap // ............ + // gap // ............ + mls v23.4S, v2.4S, v8.S[0] // .........*.. + // gap // ............ + // gap // ............ + str q16, [x2, #-16] // .......*.... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + str q22, [x2, #-48] // ........*... + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + // gap // ............ + str q23, [x2, #-32] // ..........*. + add x2, x2, #64 // ...........* + // gap // ............ + + // original source code (the postamble above, in pre-scheduling order; the marker column gives each instruction's slot in the scheduled code, and registers were renamed, e.g. v30 -> v16, v0 -> v2, v1 -> v22) + // mul v30.4S, v31.4S, v20.S[0] // *........... + // str q9, [x1, #-32] // .*.......... + // add x1, x1, #64 // ..*......... + // sqrdmulh v0.4S, v19.4S, v20.S[1] // ...*........ + // mul v1.4S, v15.4S, v20.S[0] // ....*....... + // mls v30.4S, v4.4S, v8.S[0] // .....*...... + // mls v1.4S, v11.4S, v8.S[0] // ......*..... + // str q30, [x2, #-16] // ........*... + // str q1, [x2, #-48] // .........*.. + // mls v23.4S, v0.4S, v8.S[0] // .......*.... + // str q23, [x2, #-32] // ..........*. + // add x2, x2, #64 // ...........* + + +// ----------------------------------------------------------------------------- + // Setup for the layer123 loop below: register aliases, broadcast constants, loop counter and roots. + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] // replicate the 32-bit word at [xtmp] into all four lanes of ninv (v25); ASM_LOAD presumably put the address of ninv_addr in xtmp - confirm against the macro + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] // same broadcast for ninv_tw (v26), which the loop uses as the sqrdmulh operand paired with ninv + + ushr modulus_half.4S, consts.4S, #1 // per-lane unsigned shift: modulus_half = consts >> 1; assumes every lane of consts holds the modulus - TODO confirm + neg neg_modulus_half.4S, modulus_half.4S // per-lane negation: neg_modulus_half = -modulus_half + + mov count, #8 // loop counter; it is decremented once more right before layer123_start, presumably because the software-pipelined pre-/postamble account for one iteration + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + // Scheduled instructions executed once before entering layer123_start (their pre-scheduling order is listed after them); NOTE(review): these appear to be the first-iteration instances of the loop instructions marked 'e' - confirm + .p2align 2 + ldr q4, [x0, #768] // *............ + ldr q20, [x0, #896] // .....*....... + // gap // ............. + ldr q12, [x0, #256] // .*........... + // gap // ............. + // gap // ............. + ldr q11, [x0, #384] // ..*.......... + // gap // ............. + // gap // ............. + ldr q28, [x0, #512] // ...*......... + // gap // ............. + // gap // .............
+ sub v15.4S, v4.4S, v20.4S // ......*...... + ldr q24, [x0, #640] // ....*........ + // gap // ............. + add v18.4S, v4.4S, v20.4S // .......*..... + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v9.4S, v15.4S, v3.S[1] // .........*... + // gap // ............. + // gap // ............. + sub v29.4S, v28.4S, v24.4S // ........*.... + // gap // ............. + // gap // ............. + mul v5.4S, v15.4S, v3.S[0] // ..........*.. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + sqrdmulh v20.4S, v29.4S, v2.S[3] // ...........*. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + // gap // ............. + mls v5.4S, v9.4S, v8.S[0] // ............* + // gap // ............. + // gap // ............. + + // original source code + // ldr q13, [x0, #768] // *............ + // ldr q12, [x0, #256] // ..*.......... + // ldr q11, [x0, #384] // ...*......... + // ldr q28, [x0, #512] // ....*........ + // ldr q24, [x0, #640] // ......*...... + // ldr q14, [x0, #896] // .*........... + // sub v6.4S, v13.4S, v14.4S // .....*....... + // add v18.4S, v13.4S, v14.4S // .......*..... + // sub v29.4S, v28.4S, v24.4S // .........*... + // sqrdmulh v19.4S, v6.4S, v3.S[1] // ........*.... + // mul v5.4S, v6.4S, v3.S[0] // ..........*.. + // sqrdmulh v20.4S, v29.4S, v2.S[3] // ...........*. + // mls v5.4S, v19.4S, v8.S[0] // ............* + + sub count, count, #1 +layer123_start: + ldr q16, [x0, #0] // *....................................................................................................................... + ldr q4, [x0, #128] // .*...................................................................................................................... 
+ sub v7.4S, v12.4S, v11.4S // .............*.......................................................................................................... + mul v19.4S, v29.4S, v2.S[2] // ....................*................................................................................................... + add v22.4S, v12.4S, v11.4S // ..............*......................................................................................................... + ldr q13, [x0, #784] // ......e................................................................................................................. + add v21.4S, v28.4S, v24.4S // ...................*.................................................................................................... + ldr q12, [x0, #272] // ..e..................................................................................................................... + ldr q11, [x0, #400] // ...e.................................................................................................................... + mul v17.4S, v7.4S, v2.S[0] // ...............*........................................................................................................ + ldr q28, [x0, #528] // ....e................................................................................................................... + ldr q24, [x0, #656] // .....e.................................................................................................................. + ldr q14, [x0, #912] // .......e................................................................................................................ + sub v29.4S, v16.4S, v4.4S // ........*............................................................................................................... + // gap // ........................................................................................................................ 
+ add v16.4S, v16.4S, v4.4S // .........*.............................................................................................................. + sqrdmulh v4.4S, v7.4S, v2.S[1] // ................*....................................................................................................... + // gap // ........................................................................................................................ + sub v7.4S, v21.4S, v18.4S // ......................................*................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v20.4S, v8.S[0] // ......................*................................................................................................. + add v21.4S, v21.4S, v18.4S // .......................................*................................................................................ + // gap // ........................................................................................................................ + sub v20.4S, v16.4S, v22.4S // ............................*........................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v16.4S, v16.4S, v22.4S // .............................*.......................................................................................... 
+ mul v22.4S, v29.4S, v1.S[2] // ..........*............................................................................................................. + // gap // ........................................................................................................................ + sub v6.4S, v13.4S, v14.4S // .......................e................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v18.4S, v13.4S, v14.4S // ........................e............................................................................................... + mls v17.4S, v4.4S, v8.S[0] // .................*...................................................................................................... + // gap // ........................................................................................................................ + sub v4.4S, v19.4S, v5.4S // ...........................................*............................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v13.4S, v29.4S, v1.S[3] // ...........*............................................................................................................ + add v19.4S, v19.4S, v5.4S // ............................................*........................................................................... 
+ // gap // ........................................................................................................................ + sub v14.4S, v16.4S, v21.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v16.4S, v16.4S, v21.4S // .................................................*...................................................................... + mul v21.4S, v20.4S, v0.S[2] // ..............................*......................................................................................... + // gap // ........................................................................................................................ + sub v29.4S, v28.4S, v24.4S // ..................e..................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v20.4S, v20.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v5.4S, v7.4S, v1.S[0] // ........................................*............................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v7.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v22.4S, v13.4S, v8.S[0] // ............*........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v20.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v7.4S, v8.S[0] // ..........................................*............................................................................. + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + sub v7.4S, v22.4S, v17.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v22.4S, v22.4S, v17.4S // ..................................*..................................................................................... + mul v13.4S, v4.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v17.4S, v7.4S, v0.S[2] // ...................................*.................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v20.4S, v22.4S, v19.4S // .....................................................*.................................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v22.4S, v22.4S, v19.4S // ......................................................*................................................................. + sqrdmulh v7.4S, v7.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + sub v19.4S, v21.4S, v5.4S // ..........................................................*............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v4.4S, v1.S[1] // ..............................................*......................................................................... + add v21.4S, v21.4S, v5.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mul v5.4S, v14.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v17.4S, v7.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v4.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v14.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v7.4S, v20.4S, v0.S[0] // .......................................................*................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v14.4S, v17.4S, v13.4S // ...............................................................*........................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v13.4S, v17.4S, v13.4S // ................................................................*....................................................... + mul v17.4S, v16.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v16.4S, v16.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v5.4S, v4.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v4.4S, v20.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v20.4S, v19.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + cmge v10.4S, v31.4S, v5.4S // ....................................................................*................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v19.4S, v0.S[1] // .............................................................*.......................................................... + cmge v9.4S, v5.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v7.4S, v4.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v10.4S, v9.4S // ......................................................................*................................................. 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v10.4S, v14.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v19.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v19.4S, v31.4S, v7.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v14.4S, v14.4S, v0.S[1] // ..................................................................*..................................................... + cmge v9.4S, v7.4S, v30.4S // .........................................................................*.............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v4.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v4.4S, v19.4S, v9.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v17.4S, v16.4S, v8.S[0] // ..........................................................................................*............................. + cmge v16.4S, v31.4S, v20.4S // ............................................................................*........................................... 
+ // gap // ........................................................................................................................ + cmge v19.4S, v20.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v10.4S, v14.4S, v8.S[0] // ...................................................................*.................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q5, [x0, #512] // ....................................................................................*................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v7.4S, v4.4S, v8.4S // ...........................................................................*............................................ + sub v16.4S, v16.4S, v19.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + cmge v4.4S, v31.4S, v17.4S // ....................................................................................................*................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v19.4S, v17.4S, v30.4S // .....................................................................................................*.................. + mul v14.4S, v22.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ + cmge v5.4S, v31.4S, v10.4S // ................................................................................*....................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v16.4S, v8.4S // ...............................................................................*........................................ + cmge v16.4S, v10.4S, v30.4S // .................................................................................*...................................... + // gap // ........................................................................................................................ + str q7, [x0, #640] // .....................................................................................*.................................. + sub v4.4S, v4.4S, v19.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ 
+ sqrdmulh v7.4S, v22.4S, v26.4S // ............................................................................................*........................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v16.4S, v5.4S, v16.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v22.4S, v21.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q20, [x0, #768] // ......................................................................................*................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v21.4S, v26.4S // ...............................................................................................*........................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v14.4S, v7.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v13.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v22.4S, v19.4S, v8.S[0] // ................................................................................................*....................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v19.4S, v31.4S, v14.4S // ........................................................................................................*............... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v21.4S, v13.4S, v25.4S // .................................................................................................*...................... + cmge v13.4S, v14.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mls v21.4S, v7.4S, v8.S[0] // ...................................................................................................*.................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v7.4S, v19.4S, v13.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v10.4S, v16.4S, v8.4S // ...................................................................................*.................................... + cmge v16.4S, v31.4S, v22.4S // ............................................................................................................*........... + // gap // ........................................................................................................................ + cmge v19.4S, v22.4S, v30.4S // .............................................................................................................*.......... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v17.4S, v4.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + cmge v4.4S, v31.4S, v21.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v14.4S, v7.4S, v8.4S // ...........................................................................................................*............ + cmge v7.4S, v21.4S, v30.4S // .................................................................................................................*...... + // gap // ........................................................................................................................ + str q10, [x0, #896] // .......................................................................................*................................ + sub v16.4S, v16.4S, v19.4S // ..............................................................................................................*......... + // gap // ........................................................................................................................ + sqrdmulh v19.4S, v6.4S, v3.S[1] // ..........................e............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q17, [x0], #(16) // ....................................................................................................................*... 
+ sub v4.4S, v4.4S, v7.4S // ..................................................................................................................*..... + // gap // ........................................................................................................................ + mls v22.4S, v16.4S, v8.4S // ...............................................................................................................*........ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q14, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v21.4S, v4.4S, v8.4S // ...................................................................................................................*.... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ mul v5.4S, v6.4S, v3.S[0] // .........................e.............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q22, [x0, #240] // ......................................................................................................................*. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v20.4S, v29.4S, v2.S[3] // .....................e.................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q21, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v5.4S, v19.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + + // original source code + // ldr q9, [x0, #0] // ...................................................................................................................*...................................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // ...................................................................................................................|*..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e................................................................................................................|......e............................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ...e...............................................................................................................|.......e.............................................................................................................. + // ldr q13, [x0, #(4*(1024/8))] // .....e.............................................................................................................|.........e............................................................................................................ + // ldr q14, [x0, #(5*(1024/8))] // ......e............................................................................................................|..........e........................................................................................................... 
+ // ldr q15, [x0, #(6*(1024/8))] // e..................................................................................................................|....e................................................................................................................. + // ldr q16, [x0, #(7*(1024/8))] // .......e...........................................................................................................|...........e.......................................................................................................... + // sub v24.4s, v9.4s, v10.4s // ........*..........................................................................................................|............*......................................................................................................... + // add v9.4s, v9.4s, v10.4s // .........*.........................................................................................................|.............*........................................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ................*..................................................................................................|....................*................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[3] // .....................*.............................................................................................|.........................*............................................................................................ + // mls v10.4s, v24.4s, v8.s[0] // ..............................*....................................................................................|..................................*................................................................................... 
+ // sub v24.4s, v11.4s, v12.4s // ...................................................................................................................|.*.................................................................................................................... + // add v11.4s, v11.4s, v12.4s // ...................................................................................................................|...*.................................................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ....*..............................................................................................................|........*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ..........*........................................................................................................|..............*....................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ...................*...............................................................................................|.......................*.............................................................................................. + // sub v24.4s, v13.4s, v14.4s // ..........................e........................................................................................|..............................e....................................................................................... + // add v13.4s, v13.4s, v14.4s // .*.................................................................................................................|.....*................................................................................................................ 
+ // mul v14.4s, v24.4s, v2.s[2] // ...................................................................................................................|..*................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................e..|....................................................................................................................e. + // mls v14.4s, v24.4s, v8.s[0] // ............*......................................................................................................|................*..................................................................................................... + // sub v24.4s, v15.4s, v16.4s // .................e.................................................................................................|.....................e................................................................................................ + // add v15.4s, v15.4s, v16.4s // ..................e................................................................................................|......................e............................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ..............................................................................................................e....|..................................................................................................................e... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................e..........|............................................................................................................e......... 
+ // mls v16.4s, v24.4s, v8.s[0] // ..................................................................................................................e|...................................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ..............*....................................................................................................|..................*................................................................................................... + // add v9.4s, v9.4s, v11.4s // ...............*...................................................................................................|...................*.................................................................................................. + // mul v11.4s, v24.4s, v0.s[2] // .........................*.........................................................................................|.............................*........................................................................................ + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...........................*.......................................................................................|...............................*...................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ...............................*...................................................................................|...................................*.................................................................................. + // sub v24.4s, v10.4s, v12.4s // .................................*.................................................................................|.....................................*................................................................................ 
+ // add v10.4s, v10.4s, v12.4s // ..................................*................................................................................|......................................*............................................................................... + // mul v12.4s, v24.4s, v0.s[2] // ....................................*..............................................................................|........................................*............................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................*...........................................................................|...........................................*.......................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ............................................*......................................................................|................................................*..................................................................... + // sub v24.4s, v13.4s, v15.4s // ...........*.......................................................................................................|...............*...................................................................................................... + // add v13.4s, v13.4s, v15.4s // .............*.....................................................................................................|.................*.................................................................................................... + // mul v15.4s, v24.4s, v1.s[0] // ............................*......................................................................................|................................*..................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v1.s[1] // .............................*.....................................................................................|.................................*.................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ................................*..................................................................................|....................................*................................................................................. + // sub v24.4s, v14.4s, v16.4s // ....................*..............................................................................................|........................*............................................................................................. + // add v14.4s, v14.4s, v16.4s // ......................*............................................................................................|..........................*........................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ...................................*...............................................................................|.......................................*.............................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........................................*.........................................................................|.............................................*........................................................................ + // mls v16.4s, v24.4s, v8.s[0] // .............................................*.....................................................................|.................................................*.................................................................... 
+ // sub v24.4s, v9.4s, v13.4s // .......................*...........................................................................................|...........................*.......................................................................................... + // add v9.4s, v9.4s, v13.4s // ........................*..........................................................................................|............................*......................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...........................................*.......................................................................|...............................................*...................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..............................................*....................................................................|..................................................*................................................................... + // mls v13.4s, v24.4s, v8.s[0] // ....................................................*..............................................................|........................................................*............................................................. + // sub v24.4s, v10.4s, v14.4s // .....................................*.............................................................................|.........................................*............................................................................ + // add v10.4s, v10.4s, v14.4s // ......................................*............................................................................|..........................................*........................................................................... 
+ // mul v14.4s, v24.4s, v0.s[0] // ...............................................*...................................................................|...................................................*.................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................*.............................................................|.........................................................*............................................................ + // mls v14.4s, v24.4s, v8.s[0] // ..........................................................*........................................................|..............................................................*....................................................... + // sub v24.4s, v11.4s, v15.4s // ........................................*..........................................................................|............................................*......................................................................... + // add v11.4s, v11.4s, v15.4s // ..........................................*........................................................................|..............................................*....................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................................*............................................................|..........................................................*........................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ........................................................*..........................................................|............................................................*......................................................... 
+ // mls v15.4s, v24.4s, v8.s[0] // .............................................................*.....................................................|.................................................................*.................................................... + // sub v24.4s, v12.4s, v16.4s // ................................................*..................................................................|....................................................*................................................................. + // add v12.4s, v12.4s, v16.4s // .................................................*.................................................................|.....................................................*................................................................ + // mul v16.4s, v24.4s, v0.s[0] // ............................................................*......................................................|................................................................*..................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................*...................................................|...................................................................*.................................................. + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................*............................................|..........................................................................*........................................... + // cmge v27.4s, v31.4s, v13.4s // .......................................................*...........................................................|...........................................................*.......................................................... 
+ // cmge v28.4s, v13.4s, v30.4s // .........................................................*.........................................................|.............................................................*........................................................ + // sub v28.4s, v27.4s, v28.4s // ...........................................................*.......................................................|...............................................................*...................................................... + // mls v13.4s, v28.4s, v8.4s // .................................................................*.................................................|.....................................................................*................................................ + // cmge v27.4s, v31.4s, v14.4s // ..............................................................*....................................................|..................................................................*................................................... + // cmge v28.4s, v14.4s, v30.4s // ................................................................*..................................................|....................................................................*................................................. + // sub v28.4s, v27.4s, v28.4s // ..................................................................*................................................|......................................................................*............................................... + // mls v14.4s, v28.4s, v8.4s // ........................................................................*..........................................|............................................................................*......................................... 
+ // cmge v27.4s, v31.4s, v15.4s // ....................................................................*..............................................|........................................................................*............................................. + // cmge v28.4s, v15.4s, v30.4s // .....................................................................*.............................................|.........................................................................*............................................ + // sub v28.4s, v27.4s, v28.4s // .........................................................................*.........................................|.............................................................................*........................................ + // mls v15.4s, v28.4s, v8.4s // ..............................................................................*....................................|..................................................................................*................................... + // cmge v27.4s, v31.4s, v16.4s // .............................................................................*.....................................|.................................................................................*.................................... + // cmge v28.4s, v16.4s, v30.4s // ...............................................................................*...................................|...................................................................................*.................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................................................*...............................|.......................................................................................*.............................. 
+ // mls v16.4s, v28.4s, v8.4s // ...............................................................................................*...................|...................................................................................................*.................. + // str q13, [x0, #(4*(1024/8))] // .......................................................................*...........................................|...........................................................................*.......................................... + // str q14, [x0, #(5*(1024/8))] // ................................................................................*..................................|....................................................................................*................................. + // str q15, [x0, #(6*(1024/8))] // .....................................................................................*.............................|.........................................................................................*............................ + // str q16, [x0, #(7*(1024/8))] // ......................................................................................................*............|..........................................................................................................*........... + // mul v13.4s, v9.4s, v25.4s // ..................................................*................................................................|......................................................*............................................................... + // sqrdmulh v9.4s, v9.4s, v26.4s // ...................................................*...............................................................|.......................................................*.............................................................. 
+ // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*...............................................|.......................................................................*.............................................. + // mul v14.4s, v10.4s, v25.4s // ............................................................................*......................................|................................................................................*..................................... + // sqrdmulh v10.4s, v10.4s, v26.4s // ..................................................................................*................................|......................................................................................*............................... + // mls v14.4s, v10.4s, v8.s[0] // .......................................................................................*...........................|...........................................................................................*.......................... + // mul v15.4s, v11.4s, v25.4s // ....................................................................................*..............................|........................................................................................*............................. + // sqrdmulh v11.4s, v11.4s, v26.4s // ......................................................................................*............................|..........................................................................................*........................... + // mls v15.4s, v11.4s, v8.s[0] // .........................................................................................*.........................|.............................................................................................*........................ 
+ // mul v16.4s, v12.4s, v25.4s // ...........................................................................................*.......................|...............................................................................................*...................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ........................................................................................*..........................|............................................................................................*......................... + // mls v16.4s, v12.4s, v8.s[0] // .............................................................................................*.....................|.................................................................................................*.................... + // cmge v27.4s, v31.4s, v13.4s // ..........................................................................*........................................|..............................................................................*....................................... + // cmge v28.4s, v13.4s, v30.4s // ...........................................................................*.......................................|...............................................................................*...................................... + // sub v28.4s, v27.4s, v28.4s // .................................................................................*.................................|.....................................................................................*................................ + // mls v13.4s, v28.4s, v8.4s // ..................................................................................................*................|......................................................................................................*............... 
+ // cmge v27.4s, v31.4s, v14.4s // ..........................................................................................*........................|..............................................................................................*....................... + // cmge v28.4s, v14.4s, v30.4s // ............................................................................................*......................|................................................................................................*..................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................................*....................|..................................................................................................*................... + // mls v14.4s, v28.4s, v8.4s // ....................................................................................................*..............|........................................................................................................*............. + // cmge v27.4s, v31.4s, v15.4s // ................................................................................................*..................|....................................................................................................*................. + // cmge v28.4s, v15.4s, v30.4s // .................................................................................................*.................|.....................................................................................................*................ + // sub v28.4s, v27.4s, v28.4s // .......................................................................................................*...........|...........................................................................................................*.......... 
+ // mls v15.4s, v28.4s, v8.4s // ...........................................................................................................*.......|...............................................................................................................*...... + // cmge v27.4s, v31.4s, v16.4s // ...................................................................................................*...............|.......................................................................................................*.............. + // cmge v28.4s, v16.4s, v30.4s // .....................................................................................................*.............|.........................................................................................................*............ + // sub v28.4s, v27.4s, v28.4s // ..........................................................................................................*........|..............................................................................................................*....... + // mls v16.4s, v28.4s, v8.4s // .............................................................................................................*.....|.................................................................................................................*.... + // str q13, [x0], #(16) // .........................................................................................................*.........|.............................................................................................................*........ + // str q14, [x0, #(-16 + 1*(1024/8))] // ............................................................................................................*......|................................................................................................................*..... 
+ // str q15, [x0, #(-16 + 2*(1024/8))] // ...............................................................................................................*...|...................................................................................................................*.. + // str q16, [x0, #(-16 + 3*(1024/8))] // .................................................................................................................*.|.....................................................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + sub v22.4S, v12.4S, v11.4S // ..*........................................................................................................ + mul v16.4S, v29.4S, v2.S[2] // ...*....................................................................................................... + ldr q4, [x0, #0] // *.......................................................................................................... + add v7.4S, v12.4S, v11.4S // ....*...................................................................................................... + ldr q19, [x0, #128] // .*......................................................................................................... + // gap // ........................................................................................................... + add v21.4S, v28.4S, v24.4S // .....*..................................................................................................... + mls v16.4S, v20.4S, v8.S[0] // ...........*............................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v12.4S, v22.4S, v2.S[0] // ......*.................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v11.4S, v21.4S, v18.4S // ..........*................................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v21.4S, v21.4S, v18.4S // ............*.............................................................................................. + sqrdmulh v22.4S, v22.4S, v2.S[1] // .........*................................................................................................. + // gap // ........................................................................................................... + sub v13.4S, v16.4S, v5.4S // .................*......................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v16.4S, v16.4S, v5.4S // ...................*....................................................................................... 
+ mul v17.4S, v11.4S, v1.S[0] // ........................*.................................................................................. + // gap // ........................................................................................................... + add v28.4S, v4.4S, v19.4S // ........*.................................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v4.4S, v4.4S, v19.4S // .......*................................................................................................... + sqrdmulh v19.4S, v11.4S, v1.S[1] // .........................*................................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v12.4S, v22.4S, v8.S[0] // ................*.......................................................................................... + sub v22.4S, v28.4S, v7.4S // .............*............................................................................................. + // gap // ........................................................................................................... + add v7.4S, v28.4S, v7.4S // ..............*............................................................................................ 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v11.4S, v4.4S, v1.S[2] // ...............*........................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v4.4S, v4.4S, v1.S[3] // ..................*........................................................................................ + sub v28.4S, v7.4S, v21.4S // ....................*...................................................................................... + // gap // ........................................................................................................... + add v7.4S, v7.4S, v21.4S // .....................*..................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v21.4S, v22.4S, v0.S[2] // ......................*.................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[3] // .......................*................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v24.4S, v13.4S, v1.S[0] // ...............................*........................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + sqrdmulh v13.4S, v13.4S, v1.S[1] // .....................................*..................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v11.4S, v4.4S, v8.S[0] // ..........................*................................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v17.4S, v19.4S, v8.S[0] // ............................*.............................................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v21.4S, v22.4S, v8.S[0] // ...........................*............................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + add v4.4S, v11.4S, v12.4S // ..............................*............................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v22.4S, v11.4S, v12.4S // .............................*............................................................................. + mul v19.4S, v28.4S, v0.S[0] // .......................................*................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v24.4S, v13.4S, v8.S[0] // .........................................*................................................................. + sub v12.4S, v4.4S, v16.4S // .................................*......................................................................... + // gap // ........................................................................................................... + add v16.4S, v4.4S, v16.4S // ..................................*........................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v4.4S, v22.4S, v0.S[2] // ................................*.......................................................................... + sub v11.4S, v21.4S, v17.4S // ....................................*...................................................................... + // gap // ........................................................................................................... + add v21.4S, v21.4S, v17.4S // ......................................*.................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v22.4S, v22.4S, v0.S[3] // ...................................*....................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v13.4S, v28.4S, v0.S[1] // ..........................................*................................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v17.4S, v12.4S, v0.S[0] // ...........................................*............................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v4.4S, v22.4S, v8.S[0] // ........................................*.................................................................. 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v22.4S, v7.4S, v25.4S // ..............................................*............................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v7.4S, v7.4S, v26.4S // ...............................................*........................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v28.4S, v4.4S, v24.4S // ............................................*.............................................................. + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + mls v19.4S, v13.4S, v8.S[0] // ................................................*.......................................................... + add v4.4S, v4.4S, v24.4S // .............................................*............................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v12.4S, v12.4S, v0.S[1] // .................................................*......................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v13.4S, v11.4S, v0.S[0] // ..................................................*........................................................ + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + cmge v24.4S, v31.4S, v19.4S // ...................................................*....................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v11.4S, v11.4S, v0.S[1] // ....................................................*...................................................... + cmge v14.4S, v19.4S, v30.4S // .....................................................*..................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v17.4S, v12.4S, v8.S[0] // ......................................................*.................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v12.4S, v24.4S, v14.4S // .......................................................*................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + mul v24.4S, v28.4S, v0.S[0] // ........................................................*.................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v13.4S, v11.4S, v8.S[0] // .........................................................*................................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v11.4S, v31.4S, v17.4S // ..........................................................*................................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v28.4S, v28.4S, v0.S[1] // ...........................................................*............................................... + cmge v14.4S, v17.4S, v30.4S // ............................................................*.............................................. 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v19.4S, v12.4S, v8.4S // .............................................................*............................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v12.4S, v11.4S, v14.4S // ..............................................................*............................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v22.4S, v7.4S, v8.S[0] // ...............................................................*........................................... + cmge v7.4S, v31.4S, v13.4S // ................................................................*.......................................... + // gap // ........................................................................................................... + cmge v11.4S, v13.4S, v30.4S // .................................................................*......................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + mls v24.4S, v28.4S, v8.S[0] // ..................................................................*........................................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q19, [x0, #512] // ...................................................................*....................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v17.4S, v12.4S, v8.4S // ....................................................................*...................................... + sub v7.4S, v7.4S, v11.4S // .....................................................................*..................................... + // gap // ........................................................................................................... + cmge v19.4S, v31.4S, v22.4S // ......................................................................*.................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v12.4S, v22.4S, v30.4S // .......................................................................*................................... + mul v11.4S, v16.4S, v25.4S // ........................................................................*.................................. 
+ // gap // ........................................................................................................... + cmge v28.4S, v31.4S, v24.4S // .........................................................................*................................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v16.4S, v16.4S, v26.4S // ..............................................................................*............................ + cmge v14.4S, v24.4S, v30.4S // ...........................................................................*............................... + // gap // ........................................................................................................... + str q17, [x0, #640] // ............................................................................*.............................. + sub v19.4S, v19.4S, v12.4S // .............................................................................*............................. + // gap // ........................................................................................................... + mul v12.4S, v21.4S, v25.4S // ................................................................................*.......................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v17.4S, v28.4S, v14.4S // ...............................................................................*........................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + sqrdmulh v21.4S, v21.4S, v26.4S // ..................................................................................*........................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v11.4S, v16.4S, v8.S[0] // ...................................................................................*....................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sqrdmulh v16.4S, v4.4S, v26.4S // ....................................................................................*...................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v12.4S, v21.4S, v8.S[0] // .....................................................................................*..................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + cmge v21.4S, v31.4S, v11.4S // ......................................................................................*.................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mul v4.4S, v4.4S, v25.4S // .......................................................................................*................... + cmge v28.4S, v11.4S, v30.4S // ........................................................................................*.................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ mls v4.4S, v16.4S, v8.S[0] // .........................................................................................*................. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + sub v16.4S, v21.4S, v28.4S // ..........................................................................................*................ + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v13.4S, v7.4S, v8.4S // ..........................................................................*................................ + cmge v7.4S, v31.4S, v12.4S // ............................................................................................*.............. + // gap // ........................................................................................................... + cmge v21.4S, v12.4S, v30.4S // .............................................................................................*............. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v24.4S, v17.4S, v8.4S // ...........................................................................................*............... + // gap // ........................................................................................................... + // gap // ........................................................................................................... 
+ cmge v17.4S, v31.4S, v4.4S // ...............................................................................................*........... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v22.4S, v19.4S, v8.4S // ..............................................................................................*............ + cmge v19.4S, v4.4S, v30.4S // .................................................................................................*......... + // gap // ........................................................................................................... + str q13, [x0, #768] // .................................................................................*......................... + sub v7.4S, v7.4S, v21.4S // ...................................................................................................*....... + // gap // ........................................................................................................... + mls v11.4S, v16.4S, v8.4S // ................................................................................................*.......... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q24, [x0, #896] // ..................................................................................................*........ + sub v16.4S, v17.4S, v19.4S // .....................................................................................................*..... + // gap // ........................................................................................................... 
+ mls v12.4S, v7.4S, v8.4S // ......................................................................................................*.... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q22, [x0], #(16) // ....................................................................................................*...... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + mls v4.4S, v16.4S, v8.4S // ........................................................................................................*.. + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q11, [x0, #112] // .......................................................................................................*... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q12, [x0, #240] // .........................................................................................................*. 
+ // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + // gap // ........................................................................................................... + str q4, [x0, #368] // ..........................................................................................................* + // gap // ........................................................................................................... + // gap // ........................................................................................................... + + // original source code + // ldr q16, [x0, #0] // ..*........................................................................................................ + // ldr q4, [x0, #128] // ....*...................................................................................................... + // sub v7.4S, v12.4S, v11.4S // *.......................................................................................................... + // mul v19.4S, v29.4S, v2.S[2] // .*......................................................................................................... + // add v22.4S, v12.4S, v11.4S // ...*....................................................................................................... + // add v21.4S, v28.4S, v24.4S // .....*..................................................................................................... + // mul v17.4S, v7.4S, v2.S[0] // .......*................................................................................................... 
+ // sub v29.4S, v16.4S, v4.4S // ...............*........................................................................................... + // add v16.4S, v16.4S, v4.4S // ..............*............................................................................................ + // sqrdmulh v4.4S, v7.4S, v2.S[1] // ..........*................................................................................................ + // sub v7.4S, v21.4S, v18.4S // ........*.................................................................................................. + // mls v19.4S, v20.4S, v8.S[0] // ......*.................................................................................................... + // add v21.4S, v21.4S, v18.4S // .........*................................................................................................. + // sub v20.4S, v16.4S, v22.4S // ..................*........................................................................................ + // add v16.4S, v16.4S, v22.4S // ...................*....................................................................................... + // mul v22.4S, v29.4S, v1.S[2] // ....................*...................................................................................... + // mls v17.4S, v4.4S, v8.S[0] // .................*......................................................................................... + // sub v4.4S, v19.4S, v5.4S // ...........*............................................................................................... + // sqrdmulh v13.4S, v29.4S, v1.S[3] // .....................*..................................................................................... + // add v19.4S, v19.4S, v5.4S // ............*.............................................................................................. 
+ // sub v14.4S, v16.4S, v21.4S // ......................*.................................................................................... + // add v16.4S, v16.4S, v21.4S // .......................*................................................................................... + // mul v21.4S, v20.4S, v0.S[2] // ........................*.................................................................................. + // sqrdmulh v20.4S, v20.4S, v0.S[3] // .........................*................................................................................. + // mul v5.4S, v7.4S, v1.S[0] // .............*............................................................................................. + // sqrdmulh v7.4S, v7.4S, v1.S[1] // ................*.......................................................................................... + // mls v22.4S, v13.4S, v8.S[0] // ............................*.............................................................................. + // mls v21.4S, v20.4S, v8.S[0] // ..............................*............................................................................ + // mls v5.4S, v7.4S, v8.S[0] // .............................*............................................................................. + // sub v7.4S, v22.4S, v17.4S // ................................*.......................................................................... + // add v22.4S, v22.4S, v17.4S // ...............................*........................................................................... + // mul v13.4S, v4.4S, v1.S[0] // ..........................*................................................................................ + // mul v17.4S, v7.4S, v0.S[2] // .....................................*..................................................................... 
+ // sub v20.4S, v22.4S, v19.4S // ...................................*....................................................................... + // add v22.4S, v22.4S, v19.4S // ....................................*...................................................................... + // sqrdmulh v7.4S, v7.4S, v0.S[3] // ........................................*.................................................................. + // sub v19.4S, v21.4S, v5.4S // ......................................*.................................................................... + // sqrdmulh v4.4S, v4.4S, v1.S[1] // ...........................*............................................................................... + // add v21.4S, v21.4S, v5.4S // .......................................*................................................................... + // mul v5.4S, v14.4S, v0.S[0] // .................................*......................................................................... + // mls v17.4S, v7.4S, v8.S[0] // ...........................................*............................................................... + // mls v13.4S, v4.4S, v8.S[0] // ..................................*........................................................................ + // sqrdmulh v4.4S, v14.4S, v0.S[1] // .........................................*................................................................. + // mul v7.4S, v20.4S, v0.S[0] // ..........................................*................................................................ + // sub v14.4S, v17.4S, v13.4S // ..............................................*............................................................ + // add v13.4S, v17.4S, v13.4S // ................................................*.......................................................... 
+ // mul v17.4S, v16.4S, v25.4S // ............................................*.............................................................. + // sqrdmulh v16.4S, v16.4S, v26.4S // .............................................*............................................................. + // mls v5.4S, v4.4S, v8.S[0] // ...............................................*........................................................... + // sqrdmulh v4.4S, v20.4S, v0.S[1] // .................................................*......................................................... + // mul v20.4S, v19.4S, v0.S[0] // ..................................................*........................................................ + // cmge v10.4S, v31.4S, v5.4S // ...................................................*....................................................... + // sqrdmulh v19.4S, v19.4S, v0.S[1] // ....................................................*...................................................... + // cmge v9.4S, v5.4S, v30.4S // .....................................................*..................................................... + // mls v7.4S, v4.4S, v8.S[0] // ......................................................*.................................................... + // sub v4.4S, v10.4S, v9.4S // .......................................................*................................................... + // mul v10.4S, v14.4S, v0.S[0] // ........................................................*.................................................. + // mls v20.4S, v19.4S, v8.S[0] // .........................................................*................................................. + // cmge v19.4S, v31.4S, v7.4S // ..........................................................*................................................ 
+ // sqrdmulh v14.4S, v14.4S, v0.S[1] // ...........................................................*............................................... + // cmge v9.4S, v7.4S, v30.4S // ............................................................*.............................................. + // mls v5.4S, v4.4S, v8.4S // .............................................................*............................................. + // sub v4.4S, v19.4S, v9.4S // ..............................................................*............................................ + // mls v17.4S, v16.4S, v8.S[0] // ...............................................................*........................................... + // cmge v16.4S, v31.4S, v20.4S // ................................................................*.......................................... + // cmge v19.4S, v20.4S, v30.4S // .................................................................*......................................... + // mls v10.4S, v14.4S, v8.S[0] // ..................................................................*........................................ + // str q5, [x0, #512] // ...................................................................*....................................... + // mls v7.4S, v4.4S, v8.4S // ....................................................................*...................................... + // sub v16.4S, v16.4S, v19.4S // .....................................................................*..................................... + // cmge v4.4S, v31.4S, v17.4S // ......................................................................*.................................... + // cmge v19.4S, v17.4S, v30.4S // .......................................................................*................................... + // mul v14.4S, v22.4S, v25.4S // ........................................................................*.................................. 
+ // cmge v5.4S, v31.4S, v10.4S // .........................................................................*................................. + // mls v20.4S, v16.4S, v8.4S // .........................................................................................*................. + // cmge v16.4S, v10.4S, v30.4S // ...........................................................................*............................... + // str q7, [x0, #640] // ............................................................................*.............................. + // sub v4.4S, v4.4S, v19.4S // .............................................................................*............................. + // sqrdmulh v7.4S, v22.4S, v26.4S // ..........................................................................*................................ + // sub v16.4S, v5.4S, v16.4S // ...............................................................................*........................... + // mul v22.4S, v21.4S, v25.4S // ..............................................................................*............................ + // str q20, [x0, #768] // ................................................................................................*.......... + // sqrdmulh v19.4S, v21.4S, v26.4S // ................................................................................*.......................... + // mls v14.4S, v7.4S, v8.S[0] // .................................................................................*......................... + // sqrdmulh v7.4S, v13.4S, v26.4S // ..................................................................................*........................ + // mls v22.4S, v19.4S, v8.S[0] // ...................................................................................*....................... + // cmge v19.4S, v31.4S, v14.4S // ....................................................................................*...................... 
+ // mul v21.4S, v13.4S, v25.4S // .....................................................................................*..................... + // cmge v13.4S, v14.4S, v30.4S // ......................................................................................*.................... + // mls v21.4S, v7.4S, v8.S[0] // .......................................................................................*................... + // sub v7.4S, v19.4S, v13.4S // ........................................................................................*.................. + // mls v10.4S, v16.4S, v8.4S // ............................................................................................*.............. + // cmge v16.4S, v31.4S, v22.4S // ..........................................................................................*................ + // cmge v19.4S, v22.4S, v30.4S // ...........................................................................................*............... + // mls v17.4S, v4.4S, v8.4S // ..............................................................................................*............ + // cmge v4.4S, v31.4S, v21.4S // .............................................................................................*............. + // mls v14.4S, v7.4S, v8.4S // ..................................................................................................*........ + // cmge v7.4S, v21.4S, v30.4S // ...............................................................................................*........... + // str q10, [x0, #896] // ...................................................................................................*....... + // sub v16.4S, v16.4S, v19.4S // .................................................................................................*......... + // str q17, [x0], #(16) // ......................................................................................................*.... 
+ // sub v4.4S, v4.4S, v7.4S // ....................................................................................................*...... + // mls v22.4S, v16.4S, v8.4S // .....................................................................................................*..... + // str q14, [x0, #112] // ........................................................................................................*.. + // mls v21.4S, v4.4S, v8.4S // .......................................................................................................*... + // str q22, [x0, #240] // .........................................................................................................*. + // str q21, [x0, #368] // ..........................................................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s new file mode 100644 index 00000000..4d589126 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_firestorm.s @@ -0,0 +1,2335 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10
+xtmp1 .req x11
+
+.macro ldr_vo vec, base, offset // load qform_<vec> from [base + offset]
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc // load qform_<vec> from [base], then base += inc (post-index)
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset // store qform_<vec> to [base + offset]
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc // store qform_<vec> to [base], then base += inc (post-index)
+        str qform_\vec, [\base], #\inc
+.endm
+.macro vqrdmulh d,a,b // d = sqrdmulh(a, b), vector by vector on 4 x 32-bit lanes
+        sqrdmulh \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vmls d,a,b // d -= a * b, vector by vector on 4 x 32-bit lanes
+        mls \d\().4s, \a\().4s, \b\().4s
+.endm
+.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, lane i of b)
+        sqrdmulh \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vqdmulhq d,a,b,i // d = sqdmulh(a, lane i of b)
+        sqdmulh \d\().4s, \a\().4s, \b\().s[\i] // by-element operand uses the .s[i] form, as in vqrdmulhq/vmulq/vmlsq
+.endm
+.macro vmulq d,a,b,i // d = a * (lane i of b)
+        mul \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+.macro vmlsq d,a,b,i // d -= a * (lane i of b)
+        mls \d\().4s, \a\().4s, \b\().s[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1 // three-step multiply with by-lane constants; overwrites src
+        vmulq \dst, \src, \const, \idx0 // dst = src * const[idx0]
+        vqrdmulhq \src, \src, \const, \idx1 // src = sqrdmulh(src, const[idx1])
+        vmlsq \dst, \src, consts, 0 // dst -= src * consts[0]; consts must be an alias in scope at expansion
+.endm
+
+.macro mulmod dst, src, const, const_twisted // same shape as mulmodq with full-vector constants; overwrites src
+        mul \dst\().4s, \src\().4s, \const\().4s // dst = src * const (lane-wise)
+        vqrdmulh \src, \src, \const_twisted // src = sqrdmulh(src, const_twisted)
+        vmlsq \dst, \src, consts, 0 // dst -= src * consts[0]
+.endm
+
+.macro barrett_reduce_single a // uses the tmp and consts aliases
+        srshr tmp.4S, \a\().4S, #23 // tmp = a >> 23 with rounding (signed)
+        vmls \a, tmp, consts // a -= tmp * consts (lane-wise)
+.endm
+
+.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 // uses the modulus alias
+        cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s // tmp1 lane = all-ones (-1) where neg_modulus_half >= a
+        cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s // tmp2 lane = all-ones (-1) where a >= modulus_half
+        sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s // tmp2 = -1, 0 or +1 per lane
+        vmls \a, \tmp2, modulus // a += modulus where a was <= neg_modulus_half, a -= modulus where a was >= modulus_half
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1 // a = a + b, b = mulmodq(a - b, root lanes idx0/idx1); uses tmp
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted // full-vector variant that subtracts src * modulus
+        vmul \dst, \src, \const // NOTE(review): vmul is not defined in this macro prelude - confirm before using mulmod_v
+        vqrdmulh \src, \src, \const_twisted
+        vmls \dst, \src, modulus
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted // as gs_butterfly, with full-vector root and twisted root (via mulmod)
+        sub tmp.4s, \a\().4s, \b\().4s
+        add \a\().4s, \a\().4s, \b\().4s
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 // four mulmod calls with the ninv / ninv_tw aliases, which must be in scope at expansion
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro load_vectors a0, a1, a2, a3, addr // load four consecutive 16-byte vectors from addr
+        ldr_vo \a0, \addr, (16*0)
+        ldr_vo \a1, \addr, (16*1)
+        ldr_vo \a2, \addr, (16*2)
+        ldr_vo \a3, \addr, (16*3)
+.endm
+
+.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset // as load_vectors, starting at addr + offset
+        ldr_vo \a0, \addr, (16*0 + (\offset))
+        ldr_vo \a1, \addr, (16*1 + (\offset))
+        ldr_vo \a2, \addr, (16*2 + (\offset))
+        ldr_vo \a3, \addr, (16*3 + (\offset))
+.endm
+
+.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc // store four consecutive vectors and advance addr by inc
+        str_vi \a0, \addr, \inc // post-index store: addr has already moved by inc for the stores below
+        str_vo \a1, \addr, (-(\inc) + 16*1) // -(inc) compensates for the increment above
+        str_vo \a2, \addr, (-(\inc) + 16*2)
+        str_vo \a3, \addr, (-(\inc) + 16*3)
+.endm
+
+.macro vec_to_scalar_matrix out, in // copy both 64-bit lanes of <in>0..<in>3 into the GPR aliases <out>_00..<out>_31
+        vext \out\()_00, \in\()0, 0
+        vext \out\()_01, \in\()0, 1
+        vext \out\()_10, \in\()1, 0
+        vext \out\()_11, \in\()1, 1
+        vext \out\()_20, \in\()2, 0
+        vext \out\()_21, \in\()2, 1
+        vext \out\()_30, \in\()3, 0
+        vext \out\()_31, \in\()3, 1
+.endm
+
+.macro store_scalar_matrix_with_inc x, addr, inc // store the eight GPR aliases <x>t_00..<x>t_31 and advance addr by inc
+        str \x\()t_00, [\addr], #(\inc) // post-index store: addr has already moved by inc for the stores below
+        str \x\()t_01, [\addr, #(-(\inc) + 8*1)] // inc is parenthesised so an expression argument is negated as a whole
+        str \x\()t_10, [\addr, #(-(\inc) + 8*2)]
+        str \x\()t_11, [\addr, #(-(\inc) + 8*3)]
+        str \x\()t_20, [\addr, #(-(\inc) + 8*4)]
+        str \x\()t_21, [\addr, #(-(\inc) + 8*5)]
+        str \x\()t_30, [\addr, #(-(\inc) + 8*6)]
+        str \x\()t_31, [\addr, #(-(\inc) + 8*7)]
+.endm
+
+.macro vext gpr_out, vec_in, lane // gpr_out = 64-bit lane <lane> of vec_in
+        umov \gpr_out\(), \vec_in\().d[\lane]
+.endm
+
+.macro load_roots_123 // load root0..root3 from r_ptr0 and advance r_ptr0 by 64 bytes
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+.macro load_roots_456 // body is identical to load_roots_123
+        ldr_vi root0, r_ptr0, 64
+        ldr_vo root1, r_ptr0, (-64 + 16)
+        ldr_vo root2, r_ptr0, (-64 + 32)
+        ldr_vo root3, r_ptr0, (-64 + 48)
+.endm
+
+.macro load_roots_78_part1 // load 16-byte slots 0..5 of a 12-slot group at r_ptr1; advances r_ptr1 past the whole group
+        ldr_vi root0, r_ptr1, (12*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16)
+.endm
+
+.macro load_roots_78_part2 // load slots 6..11 of the same group; offsets assume r_ptr1 was already advanced by part1
+        ldr_vo root0, r_ptr1, (-12*16 + 6*16)
+        ldr_vo root0_tw, r_ptr1, (-12*16 + 7*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+
+.macro transpose4 data0, data1, data2, data3 // in-place 4x4 transpose of 32-bit lanes; overwrites t0..t3
+        trn1 t0.4s, \data0\().4s, \data1\().4s // stage 1: interleave 32-bit lanes of row pairs
+        trn2 t1.4s, \data0\().4s, \data1\().4s
+        trn1 t2.4s, \data2\().4s, \data3\().4s
+        trn2 t3.4s, \data2\().4s, \data3\().4s
+
+        trn2 \data2\().2d, t0.2d, t2.2d // stage 2: combine 64-bit halves into the transposed rows
+        trn2 \data3\().2d, t1.2d, t3.2d
+        trn1 \data0\().2d, t0.2d, t2.2d
+        trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3 // only the 32-bit trn1/trn2 stage, into separate outputs
+        trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+        trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+        trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+        trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6) // reserve 96 bytes for x19..x30
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0] // NOTE(review): repeats the store above; redundant but harmless
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        stp x29, x30, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0] // reload x19..x30 from the slots written by save_gprs
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldp x29, x30, [sp, #16*5]
+        add sp, sp, #(16*6) // release the 96 bytes
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4) // reserve 64 bytes for d8..d15
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0] // reload d8..d15 from the slots written by save_vregs
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4) // release the 64 bytes
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()] // load a from stack slot loc; note the argument order is (register, slot)
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()] // store a to stack slot loc; note the argument order is (slot, register)
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE // scratch area addressed by save/restore
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE // undo push_stack in reverse order
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+
+        .global intt_dilithium_123_45678_opt_m1_firestorm
+        .global _intt_dilithium_123_45678_opt_m1_firestorm
+
+.p2align 4
+const_addr: .word 8380417 // 8380417 = 2^23 - 2^13 + 1
+        .word 0
+        .word 0
+        .word 0
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_opt_m1_firestorm:
+_intt_dilithium_123_45678_opt_m1_firestorm:
+        push_stack
+
+        in .req x0
+        inp .req x1
+        inpp .req x2
+        count .req x3
+        r_ptr0 .req x4
+        r_ptr1 .req x5
+        xtmp .req x6
+
+        data0 .req v9
+        data1 .req v10
+        data2 .req v11
+        data3 .req v12
+        data4 .req v13
+        data5 .req v14
+        data6 .req v15
+        data7 .req v16
+
+        qform_data0 .req q9
+        qform_data1 .req q10
+        qform_data2 .req q11
+        qform_data3 .req q12
+        qform_data4 .req q13
+        qform_data5 .req q14
+        qform_data6 .req q15
+        qform_data7 .req q16
+
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+
+        x_00 .req x10
+        x_01 .req x11
+        x_10 .req x12
+        x_11 .req x13
+        x_20 .req x14
+        x_21 .req x15
+        x_30 .req x16
+        x_31 .req x17
+
+        xt_00 .req x_00
+        xt_01 .req x_20
+
xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + modulus .req v29 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + ld4 {v19.4S, v20.4S, v21.4S, v22.4S}, [x1] // ..*........................................................................................................................................... + ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x2] // ...........*.................................................................................................................................. + ldr q29, [x5, #32] // .*............................................................................................................................................ + ldr q5, [x5, #80] // ...*.......................................................................................................................................... + ldr q0, [x5], #(12*16) // *............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q2, [x5, #-128] // ....*......................................................................................................................................... + ldr q15, [x5, #-48] // ......*....................................................................................................................................... + ldr q13, [x4], #64 // .....*........................................................................................................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q18, [x5, #-16] // .......*...................................................................................................................................... + ldr q27, [x5, #-64] // ........*..................................................................................................................................... 
+ ldr q17, [x5, #-176] // .........*.................................................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q31, [x5, #-144] // ..........*................................................................................................................................... + ldr q25, [x5, #-32] // ....................*......................................................................................................................... + ldr q28, [x5, #-96] // ......................*....................................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q3, [x5, #-80] // ..............................*............................................................................................................... + ldr q1, [x4, #-16] // ...................................................*.......................................................................................... + ldr q4, [x4, #-48] // ...................................*.......................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + ldr q16, [x4, #-32] // ...................................................................*.......................................................................... + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v6.4S, v19.4S, v20.4S // ..............*............................................................................................................................... + add v19.4S, v19.4S, v20.4S // ............*................................................................................................................................. + sub v20.4S, v21.4S, v22.4S // .............*................................................................................................................................ + add v22.4S, v21.4S, v22.4S // ...............*.............................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v23.4S, v9.4S, v10.4S // ........................*..................................................................................................................... + add v9.4S, v9.4S, v10.4S // .......................*...................................................................................................................... + sub v21.4S, v11.4S, v12.4S // .........................*.................................................................................................................... + add v12.4S, v11.4S, v12.4S // ..........................*................................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v29.4S, v6.4S, v29.4S // ................*............................................................................................................................. + sqrdmulh v5.4S, v20.4S, v5.4S // ..................*........................................................................................................................... 
+ mul v2.4S, v20.4S, v2.4S // .................*............................................................................................................................ + sqrdmulh v31.4S, v6.4S, v31.4S // ...................*.......................................................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v15.4S, v23.4S, v15.4S // ...............................*.............................................................................................................. + sqrdmulh v18.4S, v21.4S, v18.4S // .................................*............................................................................................................ + mul v27.4S, v23.4S, v27.4S // ..................................*........................................................................................................... + mul v25.4S, v21.4S, v25.4S // ................................*............................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v20.4S, v19.4S, v22.4S // .....................*........................................................................................................................ + add v19.4S, v19.4S, v22.4S // ....................................*......................................................................................................... + sub v22.4S, v9.4S, v12.4S // ......................................*....................................................................................................... + add v9.4S, v9.4S, v12.4S // .................................................*............................................................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v29.4S, v31.4S, v8.S[0] // ...........................*.................................................................................................................. + mls v2.4S, v5.4S, v8.S[0] // ............................*................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v27.4S, v15.4S, v8.S[0] // ..........................................*................................................................................................... + mls v25.4S, v18.4S, v8.S[0] // ...........................................*.................................................................................................. + mul v5.4S, v20.4S, v0.4S // .............................*................................................................................................................ + sqrdmulh v12.4S, v20.4S, v17.4S // .....................................*........................................................................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v15.4S, v22.4S, v3.4S // .........................................*.................................................................................................... + mul v18.4S, v22.4S, v28.4S // ............................................*................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v31.4S, v29.4S, v2.4S // ........................................*..................................................................................................... + add v29.4S, v29.4S, v2.4S // .......................................*...................................................................................................... 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v2.4S, v27.4S, v25.4S // ....................................................*......................................................................................... + add v27.4S, v27.4S, v25.4S // ......................................................*....................................................................................... + mls v5.4S, v12.4S, v8.S[0] // ..................................................*........................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v0.4S, v31.4S, v0.4S // ...............................................*.............................................................................................. + sqrdmulh v12.4S, v31.4S, v17.4S // ................................................*............................................................................................. + trn1 v17.4S, v19.4S, v29.4S // .............................................*................................................................................................ + trn2 v29.4S, v19.4S, v29.4S // ..............................................*............................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v19.4S, v2.4S, v28.4S // ..........................................................*................................................................................... + sqrdmulh v2.4S, v2.4S, v3.4S // ...........................................................*.................................................................................. 
+ mls v18.4S, v15.4S, v8.S[0] // .....................................................*........................................................................................ + trn1 v15.4S, v9.4S, v27.4S // ........................................................*..................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn2 v9.4S, v9.4S, v27.4S // .........................................................*.................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v0.4S, v12.4S, v8.S[0] // .......................................................*...................................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v19.4S, v2.4S, v8.S[0] // ..............................................................*............................................................................... + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn1 v2.4S, v5.4S, v0.4S // ............................................................*................................................................................. + trn2 v5.4S, v5.4S, v0.4S // .............................................................*................................................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn1 v0.4S, v18.4S, v19.4S // .....................................................................*........................................................................ + trn2 v19.4S, v18.4S, v19.4S // ......................................................................*....................................................................... 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn1 v12.2D, v17.2D, v2.2D // ...............................................................*.............................................................................. + trn2 v2.2D, v17.2D, v2.2D // .................................................................*............................................................................ + trn1 v18.2D, v29.2D, v5.2D // ................................................................*............................................................................. + trn2 v29.2D, v29.2D, v5.2D // ..................................................................*........................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + trn2 v5.2D, v15.2D, v0.2D // ........................................................................*..................................................................... + trn1 v0.2D, v15.2D, v0.2D // .........................................................................*.................................................................... + trn2 v15.2D, v9.2D, v19.2D // ...........................................................................*.................................................................. + trn1 v19.2D, v9.2D, v19.2D // ..........................................................................*................................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v9.4S, v12.4S, v18.4S // ....................................................................*......................................................................... + add v12.4S, v12.4S, v18.4S // .............................................................................*................................................................ 
+ sub v18.4S, v2.4S, v29.4S // .......................................................................*...................................................................... + add v29.4S, v2.4S, v29.4S // ...............................................................................*.............................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v2.4S, v5.4S, v15.4S // .................................................................................*............................................................ + add v5.4S, v5.4S, v15.4S // ....................................................................................*......................................................... + sub v15.4S, v0.4S, v19.4S // ................................................................................*............................................................. + add v19.4S, v0.4S, v19.4S // ..................................................................................*........................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v0.4S, v9.4S, v4.S[3] // ............................................................................*................................................................. + mul v9.4S, v9.4S, v4.S[2] // ...................................................................................*.......................................................... + mul v27.4S, v18.4S, v16.S[0] // ..............................................................................*............................................................... + sqrdmulh v18.4S, v18.4S, v16.S[1] // .....................................................................................*........................................................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v17.4S, v2.4S, v1.S[0] // ........................................................................................*..................................................... + sqrdmulh v2.4S, v2.4S, v1.S[1] // .........................................................................................*.................................................... 
+ sqrdmulh v31.4S, v15.4S, v16.S[3] // ......................................................................................*....................................................... + mul v15.4S, v15.4S, v16.S[2] // .......................................................................................*...................................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v25.4S, v12.4S, v29.4S // ...............................................................................................*.............................................. + add v29.4S, v12.4S, v29.4S // ..........................................................................................*................................................... + sub v12.4S, v19.4S, v5.4S // ............................................................................................*................................................. + add v19.4S, v19.4S, v5.4S // ...........................................................................................*.................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v9.4S, v0.4S, v8.S[0] // .............................................................................................*................................................ + mls v27.4S, v18.4S, v8.S[0] // ..............................................................................................*............................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v17.4S, v2.4S, v8.S[0] // ...................................................................................................*.......................................... + mls v15.4S, v31.4S, v8.S[0] // ..................................................................................................*........................................... 
+ srshr v5.4S, v29.4S, #23 // .................................................................................................*............................................ + srshr v0.4S, v19.4S, #23 // ................................................................................................*............................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v2.4S, v25.4S, v13.S[2] // .....................................................................................................*........................................ + sqrdmulh v18.4S, v25.4S, v13.S[3] // .............................................................................................................*................................ + sqrdmulh v31.4S, v12.4S, v4.S[1] // ....................................................................................................*......................................... + mul v12.4S, v12.4S, v4.S[0] // ........................................................................................................*..................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v25.4S, v9.4S, v27.4S // ......................................................................................................*....................................... + add v9.4S, v9.4S, v27.4S // .......................................................................................................*...................................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v29.4S, v5.4S, v8.4S // ..........................................................................................................*................................... + mls v19.4S, v0.4S, v8.4S // .........................................................................................................*.................................... 
+ sub v5.4S, v15.4S, v17.4S // ............................................................................................................*................................. + add v0.4S, v15.4S, v17.4S // ...........................................................................................................*.................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v12.4S, v31.4S, v8.S[0] // .....................................................................................................................*........................ + mul v15.4S, v25.4S, v13.S[2] // ..............................................................................................................*............................... + sqrdmulh v27.4S, v25.4S, v13.S[3] // ...............................................................................................................*.............................. + srshr v17.4S, v9.4S, #23 // ................................................................................................................*............................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sqrdmulh v31.4S, v5.4S, v4.S[1] // .................................................................................................................*............................ + mul v5.4S, v5.4S, v4.S[0] // ..................................................................................................................*........................... + mls v2.4S, v18.4S, v8.S[0] // ......................................................................................................................*....................... + srshr v18.4S, v0.4S, #23 // ...................................................................................................................*.......................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + add v25.4S, v29.4S, v19.4S // ........................................................................................................................*..................... + sub v29.4S, v29.4S, v19.4S // ....................................................................................................................*......................... 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v9.4S, v17.4S, v8.4S // .......................................................................................................................*...................... + mls v15.4S, v27.4S, v8.S[0] // .........................................................................................................................*.................... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mls v0.4S, v18.4S, v8.4S // ..........................................................................................................................*................... + mls v5.4S, v31.4S, v8.S[0] // ............................................................................................................................*................. + str q25, [x1], #(16*4) // ..............................................................................................................................*............... + mul v19.4S, v29.4S, v13.S[0] // ...........................................................................................................................*.................. + sqrdmulh v29.4S, v29.4S, v13.S[1] // .............................................................................................................................*................ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v18.4S, v2.4S, v12.4S // ...............................................................................................................................*.............. 
+ add v26.4S, v2.4S, v12.4S // .......................................................................................................................................*...... + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + sub v2.4S, v9.4S, v0.4S // ................................................................................................................................*............. + add v24.4S, v9.4S, v0.4S // ..................................................................................................................................*........... + mls v19.4S, v29.4S, v8.S[0] // ...................................................................................................................................*.......... + sub v29.4S, v15.4S, v5.4S // .................................................................................................................................*............ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + add v5.4S, v15.4S, v5.4S // ......................................................................................................................................*....... 
+ mul v11.4S, v18.4S, v13.S[0] // ....................................................................................................................................*......... + sqrdmulh v10.4S, v18.4S, v13.S[1] // .....................................................................................................................................*........ + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + mul v6.4S, v2.4S, v13.S[0] // ........................................................................................................................................*..... + sqrdmulh v23.4S, v2.4S, v13.S[1] // .........................................................................................................................................*.... + mul v12.4S, v29.4S, v13.S[0] // ..........................................................................................................................................*... + sqrdmulh v14.4S, v29.4S, v13.S[1] // ...........................................................................................................................................*.. 
+ // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + str q19, [x2], #(16*4) // ............................................................................................................................................*. + str q5, [x1, #-16] // .............................................................................................................................................* + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. + // gap // .............................................................................................................................................. 
+ + // original source code + // ldr q9, [x5], #(12*16) // ....*......................................................................................................................................... + // ldr q25, [x5, #-160] // ..*........................................................................................................................................... + // ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // *............................................................................................................................................. + // ldr q26, [x5, #-112] // ...*.......................................................................................................................................... + // ldr q0, [x5, #-128] // .....*........................................................................................................................................ + // ldr q31, [x4], #64 // .......*...................................................................................................................................... + // ldr q23, [x5, #-48] // ......*....................................................................................................................................... + // ldr q7, [x5, #-16] // ........*..................................................................................................................................... + // ldr q2, [x5, #-64] // .........*.................................................................................................................................... + // ldr q6, [x5, #-176] // ..........*................................................................................................................................... + // ldr q1, [x5, #-144] // ...........*.................................................................................................................................. 
+ // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // .*............................................................................................................................................ + // add v4.4S, v18.4S, v19.4S // ...................*.......................................................................................................................... + // sub v24.4S, v20.4S, v21.4S // ....................*......................................................................................................................... + // sub v11.4S, v18.4S, v19.4S // ..................*........................................................................................................................... + // add v12.4S, v20.4S, v21.4S // .....................*........................................................................................................................ + // mul v27.4S, v11.4S, v25.4S // ..........................*................................................................................................................... + // mul v22.4S, v24.4S, v0.4S // ............................*................................................................................................................. + // sqrdmulh v5.4S, v24.4S, v26.4S // ...........................*.................................................................................................................. + // sqrdmulh v13.4S, v11.4S, v1.4S // .............................*................................................................................................................ + // ldr q26, [x5, #-32] // ............*................................................................................................................................. + // sub v0.4S, v4.4S, v12.4S // ..................................*........................................................................................................... 
+ // ldr q25, [x5, #-96] // .............*................................................................................................................................ + // add v20.4S, v14.4S, v15.4S // .......................*...................................................................................................................... + // sub v3.4S, v14.4S, v15.4S // ......................*....................................................................................................................... + // sub v10.4S, v16.4S, v17.4S // ........................*..................................................................................................................... + // add v24.4S, v16.4S, v17.4S // .........................*.................................................................................................................... + // mls v27.4S, v13.4S, v8.S[0] // ......................................*....................................................................................................... + // mls v22.4S, v5.4S, v8.S[0] // .......................................*...................................................................................................... + // mul v21.4S, v0.4S, v9.4S // ..........................................*................................................................................................... + // ldr q16, [x5, #-80] // ..............*............................................................................................................................... + // sqrdmulh v30.4S, v3.4S, v23.4S // ..............................*............................................................................................................... + // mul v1.4S, v10.4S, v26.4S // .................................*............................................................................................................ 
+ // sqrdmulh v23.4S, v10.4S, v7.4S // ...............................*.............................................................................................................. + // mul v26.4S, v3.4S, v2.4S // ................................*............................................................................................................. + // ldr q3, [x4, #-48] // ................*............................................................................................................................. + // add v2.4S, v4.4S, v12.4S // ...................................*.......................................................................................................... + // sqrdmulh v19.4S, v0.4S, v6.4S // ...........................................*.................................................................................................. + // sub v14.4S, v20.4S, v24.4S // ....................................*......................................................................................................... + // add v10.4S, v27.4S, v22.4S // ...............................................*.............................................................................................. + // sub v7.4S, v27.4S, v22.4S // ..............................................*............................................................................................... + // sqrdmulh v13.4S, v14.4S, v16.4S // ............................................*................................................................................................. + // mls v26.4S, v30.4S, v8.S[0] // ........................................*..................................................................................................... + // mls v1.4S, v23.4S, v8.S[0] // .........................................*.................................................................................................... 
+ // mul v5.4S, v14.4S, v25.4S // .............................................*................................................................................................ + // trn1 v11.4S, v2.4S, v10.4S // .....................................................*........................................................................................ + // trn2 v23.4S, v2.4S, v10.4S // ......................................................*....................................................................................... + // mul v29.4S, v7.4S, v9.4S // ...................................................*.......................................................................................... + // sqrdmulh v10.4S, v7.4S, v6.4S // ....................................................*......................................................................................... + // add v30.4S, v20.4S, v24.4S // .....................................*........................................................................................................ + // mls v21.4S, v19.4S, v8.S[0] // ..................................................*........................................................................................... + // ldr q24, [x4, #-16] // ...............*.............................................................................................................................. + // sub v28.4S, v26.4S, v1.4S // ................................................*............................................................................................. + // mls v5.4S, v13.4S, v8.S[0] // .........................................................*.................................................................................... + // add v14.4S, v26.4S, v1.4S // .................................................*............................................................................................ 
+ // mls v29.4S, v10.4S, v8.S[0] // ............................................................*................................................................................. + // trn1 v6.4S, v30.4S, v14.4S // ..........................................................*................................................................................... + // trn2 v30.4S, v30.4S, v14.4S // ...........................................................*.................................................................................. + // mul v14.4S, v28.4S, v25.4S // .......................................................*...................................................................................... + // sqrdmulh v1.4S, v28.4S, v16.4S // ........................................................*..................................................................................... + // trn1 v19.4S, v21.4S, v29.4S // ..............................................................*............................................................................... + // trn2 v26.4S, v21.4S, v29.4S // ...............................................................*.............................................................................. + // mls v14.4S, v1.4S, v8.S[0] // .............................................................*................................................................................ + // trn1 v16.2D, v11.2D, v19.2D // ..................................................................*........................................................................... + // trn1 v20.2D, v23.2D, v26.2D // ....................................................................*......................................................................... + // trn2 v4.2D, v11.2D, v19.2D // ...................................................................*.......................................................................... 
+ // trn2 v0.2D, v23.2D, v26.2D // .....................................................................*........................................................................ + // ldr q29, [x4, #-32] // .................*............................................................................................................................ + // sub v21.4S, v16.4S, v20.4S // ..........................................................................*................................................................... + // trn1 v10.4S, v5.4S, v14.4S // ................................................................*............................................................................. + // trn2 v14.4S, v5.4S, v14.4S // .................................................................*............................................................................ + // sub v7.4S, v4.4S, v0.4S // ............................................................................*................................................................. + // trn2 v23.2D, v6.2D, v10.2D // ......................................................................*....................................................................... + // trn1 v17.2D, v6.2D, v10.2D // .......................................................................*...................................................................... + // trn1 v10.2D, v30.2D, v14.2D // .........................................................................*.................................................................... + // trn2 v11.2D, v30.2D, v14.2D // ........................................................................*..................................................................... + // sqrdmulh v1.4S, v21.4S, v3.S[3] // ..................................................................................*........................................................... 
+ // add v30.4S, v16.4S, v20.4S // ...........................................................................*.................................................................. + // mul v20.4S, v7.4S, v29.S[0] // ....................................................................................*......................................................... + // add v16.4S, v4.4S, v0.4S // .............................................................................*................................................................ + // sub v26.4S, v17.4S, v10.4S // ................................................................................*............................................................. + // sub v19.4S, v23.4S, v11.4S // ..............................................................................*............................................................... + // add v28.4S, v17.4S, v10.4S // .................................................................................*............................................................ + // mul v4.4S, v21.4S, v3.S[2] // ...................................................................................*.......................................................... + // add v14.4S, v23.4S, v11.4S // ...............................................................................*.............................................................. + // sqrdmulh v6.4S, v7.4S, v29.S[1] // .....................................................................................*........................................................ + // sqrdmulh v21.4S, v26.4S, v29.S[3] // ........................................................................................*..................................................... + // mul v11.4S, v26.4S, v29.S[2] // .........................................................................................*.................................................... 
+ // mul v26.4S, v19.4S, v24.S[0] // ......................................................................................*....................................................... + // sqrdmulh v24.4S, v19.4S, v24.S[1] // .......................................................................................*...................................................... + // add v7.4S, v30.4S, v16.4S // ...........................................................................................*.................................................. + // add v23.4S, v28.4S, v14.4S // .............................................................................................*................................................ + // sub v10.4S, v28.4S, v14.4S // ............................................................................................*................................................. + // mls v4.4S, v1.4S, v8.S[0] // ..............................................................................................*............................................... + // mls v20.4S, v6.4S, v8.S[0] // ...............................................................................................*.............................................. + // sub v19.4S, v30.4S, v16.4S // ..........................................................................................*................................................... + // srshr v14.4S, v23.4S, #23 // ...................................................................................................*.......................................... + // srshr v30.4S, v7.4S, #23 // ..................................................................................................*........................................... + // mls v11.4S, v21.4S, v8.S[0] // .................................................................................................*............................................ 
+ // mls v26.4S, v24.4S, v8.S[0] // ................................................................................................*............................................. + // sqrdmulh v5.4S, v10.4S, v3.S[1] // ......................................................................................................*....................................... + // mul v16.4S, v19.4S, v31.S[2] // ....................................................................................................*......................................... + // sub v9.4S, v4.4S, v20.4S // ........................................................................................................*..................................... + // add v1.4S, v4.4S, v20.4S // .........................................................................................................*.................................... + // mul v20.4S, v10.4S, v3.S[0] // .......................................................................................................*...................................... + // mls v23.4S, v14.4S, v8.4S // ...........................................................................................................*.................................. + // mls v7.4S, v30.4S, v8.4S // ..........................................................................................................*................................... + // add v21.4S, v11.4S, v26.4S // .............................................................................................................*................................ + // sub v22.4S, v11.4S, v26.4S // ............................................................................................................*................................. + // sqrdmulh v14.4S, v19.4S, v31.S[3] // .....................................................................................................*........................................ 
+ // mul v4.4S, v9.4S, v31.S[2] // ...............................................................................................................*.............................. + // sqrdmulh v26.4S, v9.4S, v31.S[3] // ................................................................................................................*............................. + // srshr v24.4S, v1.4S, #23 // .................................................................................................................*............................ + // sqrdmulh v30.4S, v22.4S, v3.S[1] // ..................................................................................................................*........................... + // mul v15.4S, v22.4S, v3.S[0] // ...................................................................................................................*.......................... + // srshr v9.4S, v21.4S, #23 // .....................................................................................................................*........................ + // sub v10.4S, v7.4S, v23.4S // .......................................................................................................................*...................... + // mls v20.4S, v5.4S, v8.S[0] // ..............................................................................................................*............................... + // mls v16.4S, v14.4S, v8.S[0] // ....................................................................................................................*......................... + // mls v1.4S, v24.4S, v8.4S // ........................................................................................................................*..................... + // add v14.4S, v7.4S, v23.4S // ......................................................................................................................*....................... 
+ // mls v4.4S, v26.4S, v8.S[0] // .........................................................................................................................*.................... + // mls v21.4S, v9.4S, v8.4S // ..........................................................................................................................*................... + // mul v7.4S, v10.4S, v31.S[0] // .............................................................................................................................*................ + // mls v15.4S, v30.4S, v8.S[0] // ...........................................................................................................................*.................. + // sqrdmulh v26.4S, v10.4S, v31.S[1] // ..............................................................................................................................*............... + // str q14, [x1], #(16*4) // ............................................................................................................................*................. + // sub v10.4S, v16.4S, v20.4S // ...............................................................................................................................*.............. + // sub v30.4S, v1.4S, v21.4S // .................................................................................................................................*............ + // sub v14.4S, v4.4S, v15.4S // ....................................................................................................................................*......... + // add v24.4S, v1.4S, v21.4S // ..................................................................................................................................*........... + // mls v7.4S, v26.4S, v8.S[0] // ...................................................................................................................................*.......... 
+ // mul v11.4S, v10.4S, v31.S[0] // ......................................................................................................................................*....... + // sqrdmulh v10.4S, v10.4S, v31.S[1] // .......................................................................................................................................*...... + // add v2.4S, v4.4S, v15.4S // .....................................................................................................................................*........ + // add v26.4S, v16.4S, v20.4S // ................................................................................................................................*............. + // mul v6.4S, v30.4S, v31.S[0] // ........................................................................................................................................*..... + // sqrdmulh v23.4S, v30.4S, v31.S[1] // .........................................................................................................................................*.... + // mul v12.4S, v14.4S, v31.S[0] // ..........................................................................................................................................*... + // sqrdmulh v14.4S, v14.4S, v31.S[1] // ...........................................................................................................................................*.. + // str q7, [x2], #(16*4) // ............................................................................................................................................*. 
+ // str q2, [x1, #-16] // .............................................................................................................................................* + + sub count, count, #1 +layer45678_start: + ldr q9, [x5], #(12*16) // ..e..................................................................................................................................................... + ldr q25, [x5, #-160] // ....e................................................................................................................................................... + mls v11.4S, v10.4S, v8.S[0] // ........................................................................................................................................*............... + str q26, [x1, #-32] // ................................................................................................................................................*....... + str q24, [x1, #-48] // ...............................................................................................................................................*........ + add x1, x1, #64 // ......................................................................................................................................................*. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ld4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x1] // e....................................................................................................................................................... + ldr q26, [x5, #-112] // .......e................................................................................................................................................ 
+ ldr q0, [x5, #-128] // ......e................................................................................................................................................. + mls v12.4S, v14.4S, v8.S[0] // .............................................................................................................................................*.......... + mls v6.4S, v23.4S, v8.S[0] // ...................................................................................................................................*.................... + ldr q31, [x4], #64 // ......................................................................e................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q23, [x5, #-48] // ...............................e........................................................................................................................ + ldr q7, [x5, #-16] // .................................e...................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q11, [x2, #-32] // ....................................................................................................................................................*... + ldr q2, [x5, #-64] // ..............................e......................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ str q12, [x2, #-16] // .....................................................................................................................................................*.. + str q6, [x2, #-48] // ...................................................................................................................................................*.... + ldr q6, [x5, #-176] // ...e.................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add x2, x2, #64 // .......................................................................................................................................................* + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q1, [x5, #-144] // .....e.................................................................................................................................................. + // gap // ........................................................................................................................................................ + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x2] // .e...................................................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add v4.4S, v18.4S, v19.4S // .........e.............................................................................................................................................. + sub v24.4S, v20.4S, v21.4S // .............e.......................................................................................................................................... + sub v11.4S, v18.4S, v19.4S // ........e............................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v12.4S, v20.4S, v21.4S // ..............e......................................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v27.4S, v11.4S, v25.4S // ..........e............................................................................................................................................. + mul v22.4S, v24.4S, v0.4S // ...............e........................................................................................................................................ + sqrdmulh v5.4S, v24.4S, v26.4S // ................e....................................................................................................................................... + sqrdmulh v13.4S, v11.4S, v1.4S // ...........e............................................................................................................................................ + ldr q26, [x5, #-32] // ................................e....................................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v0.4S, v4.4S, v12.4S // ..................e..................................................................................................................................... + ldr q25, [x5, #-96] // ............................e........................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add v20.4S, v14.4S, v15.4S // ...................................e.................................................................................................................... + sub v3.4S, v14.4S, v15.4S // ..................................e..................................................................................................................... + // gap // ........................................................................................................................................................ + sub v10.4S, v16.4S, v17.4S // .......................................e................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v24.4S, v16.4S, v17.4S // ........................................e............................................................................................................... + mls v27.4S, v13.4S, v8.S[0] // ............e........................................................................................................................................... + mls v22.4S, v5.4S, v8.S[0] // .................e...................................................................................................................................... 
+ mul v21.4S, v0.4S, v9.4S // ....................e................................................................................................................................... + ldr q16, [x5, #-80] // .............................e.......................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v3.4S, v23.4S // .....................................e.................................................................................................................. + mul v1.4S, v10.4S, v26.4S // .........................................e.............................................................................................................. + sqrdmulh v23.4S, v10.4S, v7.4S // ..........................................e............................................................................................................. + // gap // ........................................................................................................................................................ + mul v26.4S, v3.4S, v2.4S // ....................................e................................................................................................................... + ldr q3, [x4, #-48] // .......................................................................e................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v2.4S, v4.4S, v12.4S // ...................e.................................................................................................................................... + sqrdmulh v19.4S, v0.4S, v6.4S // .....................e.................................................................................................................................. + sub v14.4S, v20.4S, v24.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v10.4S, v27.4S, v22.4S // ........................e............................................................................................................................... 
+ sub v7.4S, v27.4S, v22.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v13.4S, v14.4S, v16.4S // ...............................................e........................................................................................................ + mls v26.4S, v30.4S, v8.S[0] // ......................................e................................................................................................................. + mls v1.4S, v23.4S, v8.S[0] // ...........................................e............................................................................................................ + mul v5.4S, v14.4S, v25.4S // ..............................................e......................................................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v11.4S, v2.4S, v10.4S // ......................................................e................................................................................................. + trn2 v23.4S, v2.4S, v10.4S // .......................................................e................................................................................................ + mul v29.4S, v7.4S, v9.4S // .........................e.............................................................................................................................. + sqrdmulh v10.4S, v7.4S, v6.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add v30.4S, v20.4S, v24.4S // .............................................e.......................................................................................................... + mls v21.4S, v19.4S, v8.S[0] // ......................e................................................................................................................................. + ldr q24, [x4, #-16] // .........................................................................e.............................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v28.4S, v26.4S, v1.4S // .................................................e...................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v5.4S, v13.4S, v8.S[0] // ................................................e....................................................................................................... + add v14.4S, v26.4S, v1.4S // ..................................................e..................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v29.4S, v10.4S, v8.S[0] // ...........................e............................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v6.4S, v30.4S, v14.4S // ..............................................................e......................................................................................... + trn2 v30.4S, v30.4S, v14.4S // ...............................................................e........................................................................................ + mul v14.4S, v28.4S, v25.4S // ...................................................e.................................................................................................... + sqrdmulh v1.4S, v28.4S, v16.4S // ....................................................e................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v19.4S, v21.4S, v29.4S // ........................................................e............................................................................................... + trn2 v26.4S, v21.4S, v29.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v14.4S, v1.4S, v8.S[0] // .....................................................e.................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + trn1 v16.2D, v11.2D, v19.2D // ............................................................e........................................................................................... + trn1 v20.2D, v23.2D, v26.2D // .............................................................e.......................................................................................... + trn2 v4.2D, v11.2D, v19.2D // ..........................................................e............................................................................................. + trn2 v0.2D, v23.2D, v26.2D // ...........................................................e............................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q29, [x4, #-32] // ........................................................................e............................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v21.4S, v16.4S, v20.4S // ..........................................................................e............................................................................. + trn1 v10.4S, v5.4S, v14.4S // ................................................................e....................................................................................... + trn2 v14.4S, v5.4S, v14.4S // .................................................................e...................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v7.4S, v4.4S, v0.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ trn2 v23.2D, v6.2D, v10.2D // ..................................................................e..................................................................................... + trn1 v17.2D, v6.2D, v10.2D // ....................................................................e................................................................................... + trn1 v10.2D, v30.2D, v14.2D // .....................................................................e.................................................................................. + trn2 v11.2D, v30.2D, v14.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v21.4S, v3.S[3] // .............................................................................e.......................................................................... + add v30.4S, v16.4S, v20.4S // ...........................................................................e............................................................................ + mul v20.4S, v7.4S, v29.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v16.4S, v4.4S, v0.4S // ................................................................................e....................................................................... + sub v26.4S, v17.4S, v10.4S // ....................................................................................e................................................................... + sub v19.4S, v23.4S, v11.4S // .........................................................................................e.............................................................. + add v28.4S, v17.4S, v10.4S // .....................................................................................e.................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v4.4S, v21.4S, v3.S[2] // ............................................................................e........................................................................... + add v14.4S, v23.4S, v11.4S // ..........................................................................................e............................................................. + sqrdmulh v6.4S, v7.4S, v29.S[1] // ..................................................................................e..................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v21.4S, v26.4S, v29.S[3] // .......................................................................................e................................................................ + mul v11.4S, v26.4S, v29.S[2] // ......................................................................................e................................................................. 
+ mul v26.4S, v19.4S, v24.S[0] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v24.4S, v19.4S, v24.S[1] // ............................................................................................e........................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v7.4S, v30.4S, v16.4S // ...............................................................................................e........................................................ + add v23.4S, v28.4S, v14.4S // .........................................................................................................e.............................................. + sub v10.4S, v28.4S, v14.4S // ........................................................................................................e............................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v4.4S, v1.4S, v8.S[0] // ..............................................................................e......................................................................... + mls v20.4S, v6.4S, v8.S[0] // ...................................................................................e.................................................................... + sub v19.4S, v30.4S, v16.4S // ..............................................................................................e......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ srshr v14.4S, v23.4S, #23 // ......................................................................................................................e................................. + srshr v30.4S, v7.4S, #23 // ..................................................................................................................e..................................... + mls v11.4S, v21.4S, v8.S[0] // ........................................................................................e............................................................... + mls v26.4S, v24.4S, v8.S[0] // .............................................................................................e.......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v5.4S, v10.4S, v3.S[1] // ...........................................................................................................e............................................ + mul v16.4S, v19.4S, v31.S[2] // ................................................................................................e....................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v9.4S, v4.4S, v20.4S // ...................................................................................................e.................................................... + add v1.4S, v4.4S, v20.4S // ....................................................................................................e................................................... + mul v20.4S, v10.4S, v3.S[0] // ..........................................................................................................e............................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v23.4S, v14.4S, v8.4S // .......................................................................................................................e................................ + mls v7.4S, v30.4S, v8.4S // ...................................................................................................................e.................................... + add v21.4S, v11.4S, v26.4S // ..............................................................................................................e......................................... + sub v22.4S, v11.4S, v26.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v14.4S, v19.4S, v31.S[3] // .................................................................................................e...................................................... 
+ mul v4.4S, v9.4S, v31.S[2] // .....................................................................................................e.................................................. + sqrdmulh v26.4S, v9.4S, v31.S[3] // ......................................................................................................e................................................. + srshr v24.4S, v1.4S, #23 // ....................................................................................................................e................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v22.4S, v3.S[1] // ................................................................................................................e....................................... + mul v15.4S, v22.4S, v3.S[0] // ...............................................................................................................e........................................ + // gap // ........................................................................................................................................................ + srshr v9.4S, v21.4S, #23 // ........................................................................................................................e............................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v10.4S, v7.4S, v23.4S // ..........................................................................................................................e............................. + mls v20.4S, v5.4S, v8.S[0] // ............................................................................................................e........................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v16.4S, v14.4S, v8.S[0] // ..................................................................................................e..................................................... + mls v1.4S, v24.4S, v8.4S // .....................................................................................................................e.................................. + add v14.4S, v7.4S, v23.4S // ...........................................................................................................................e............................ + mls v4.4S, v26.4S, v8.S[0] // .......................................................................................................e................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v21.4S, v9.4S, v8.4S // .........................................................................................................................e.............................. + mul v7.4S, v10.4S, v31.S[0] // ............................................................................................................................e........................... 
+ mls v15.4S, v30.4S, v8.S[0] // .................................................................................................................e...................................... + sqrdmulh v26.4S, v10.4S, v31.S[1] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q14, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v10.4S, v16.4S, v20.4S // ....................................................................................................................................e................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ sub v30.4S, v1.4S, v21.4S // ...............................................................................................................................e........................ + sub v14.4S, v4.4S, v15.4S // .........................................................................................................................................e.............. + add v24.4S, v1.4S, v21.4S // ................................................................................................................................e....................... + mls v7.4S, v26.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v11.4S, v10.4S, v31.S[0] // ......................................................................................................................................e................. + sqrdmulh v10.4S, v10.4S, v31.S[1] // .......................................................................................................................................e................ + add v2.4S, v4.4S, v15.4S // ..........................................................................................................................................e............. 
+ add v26.4S, v16.4S, v20.4S // .....................................................................................................................................e.................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v6.4S, v30.4S, v31.S[0] // .................................................................................................................................e...................... + sqrdmulh v23.4S, v30.4S, v31.S[1] // ..................................................................................................................................e..................... + mul v12.4S, v14.4S, v31.S[0] // ...........................................................................................................................................e............ + sqrdmulh v14.4S, v14.4S, v31.S[1] // ............................................................................................................................................e........... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q7, [x2], #(16*4) // ..................................................................................................................................................e..... + str q2, [x1, #-16] // .................................................................................................................................................e...... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // ......e.................................................................................................................................................|.....e............. 
+ // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // .....................e..................................................................................................................................|................... + // ldr q0, [x5], #(12*16) // e.......................................................................................................................................................e................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ..................e.....................................................................................................................................|.................e. + // ldr q1, [x5, #(-12*16 + 2*16)] // .e......................................................................................................................................................|e.................. + // ldr q5, [x5, #(-12*16 + 3*16)] // ....................e...................................................................................................................................|................... + // ldr q2, [x5, #(-12*16 + 4*16)] // ........e...............................................................................................................................................|.......e........... + // ldr q6, [x5, #(-12*16 + 5*16)] // .......e................................................................................................................................................|......e............ + // sub v24.4s, v9.4s, v10.4s // ........................e...............................................................................................................................|................... + // add v9.4s, v9.4s, v10.4s // ......................e.................................................................................................................................|................... 
+ // mul v10.4s, v24.4s, v1.4s // ..........................e.............................................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .............................e..........................................................................................................................|................... + // mls v10.4s, v24.4s, v8.s[0] // .....................................e..................................................................................................................|................... + // sub v24.4s, v11.4s, v12.4s // .......................e................................................................................................................................|................... + // add v11.4s, v11.4s, v12.4s // .........................e..............................................................................................................................|................... + // mul v12.4s, v24.4s, v2.4s // ...........................e............................................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ............................e...........................................................................................................................|................... + // mls v12.4s, v24.4s, v8.s[0] // ......................................e.................................................................................................................|................... + // sub v24.4s, v9.4s, v11.4s // ...............................e........................................................................................................................|................... 
+ // add v9.4s, v9.4s, v11.4s // ..............................................e.........................................................................................................|................... + // mul v11.4s, v24.4s, v0.4s // .......................................e................................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...............................................e........................................................................................................|................... + // mls v11.4s, v24.4s, v8.s[0] // ............................................................e...........................................................................................|................... + // sub v24.4s, v10.4s, v12.4s // ..................................................e.....................................................................................................|................... + // add v10.4s, v10.4s, v12.4s // .................................................e......................................................................................................|................... + // mul v12.4s, v24.4s, v0.4s // .........................................................e..............................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..........................................................e.............................................................................................|................... + // mls v12.4s, v24.4s, v8.s[0] // .................................................................e......................................................................................|................... 
+ // ldr q0, [x5, #(-12*16 + 6*16)] // ................................e.......................................................................................................................|................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ........................................e...............................................................................................................|................... + // ldr q1, [x5, #(-12*16 + 8*16)] // ...............e........................................................................................................................................|..............e.... + // ldr q5, [x5, #(-12*16 + 9*16)] // ............e...........................................................................................................................................|...........e....... + // ldr q2, [x5, #(-12*16 + 10*16)] // ..............................e.........................................................................................................................|................... + // ldr q6, [x5, #(-12*16 + 11*16)] // .............e..........................................................................................................................................|............e...... + // sub v24.4s, v13.4s, v14.4s // ..................................e.....................................................................................................................|................... + // add v13.4s, v13.4s, v14.4s // .................................e......................................................................................................................|................... + // mul v14.4s, v24.4s, v1.4s // ............................................e...........................................................................................................|................... 
+ // sqrdmulh v24.4s, v24.4s, v5.4s // .........................................e..............................................................................................................|................... + // mls v14.4s, v24.4s, v8.s[0] // ....................................................e...................................................................................................|................... + // sub v24.4s, v15.4s, v16.4s // ...................................e....................................................................................................................|................... + // add v15.4s, v15.4s, v16.4s // ....................................e...................................................................................................................|................... + // mul v16.4s, v24.4s, v2.4s // ..........................................e.............................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................e............................................................................................................|................... + // mls v16.4s, v24.4s, v8.s[0] // .....................................................e..................................................................................................|................... + // sub v24.4s, v13.4s, v15.4s // ................................................e.......................................................................................................|................... + // add v13.4s, v13.4s, v15.4s // ...........................................................e............................................................................................|................... 
+ // mul v15.4s, v24.4s, v0.4s // ......................................................e.................................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................e....................................................................................................|................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................e........................................................................................|................... + // sub v24.4s, v14.4s, v16.4s // ..............................................................e.........................................................................................|................... + // add v14.4s, v14.4s, v16.4s // ................................................................e.......................................................................................|................... + // mul v16.4s, v24.4s, v0.4s // ....................................................................e...................................................................................|................... + // sqrdmulh v24.4s, v24.4s, v4.4s // .....................................................................e..................................................................................|................... + // mls v16.4s, v24.4s, v8.s[0] // ........................................................................e...............................................................................|................... + // trn1 v25.4s, v9.4s, v10.4s // .......................................................e................................................................................................|................... 
+ // trn2 v26.4s, v9.4s, v10.4s // ........................................................e...............................................................................................|................... + // trn1 v27.4s, v11.4s, v12.4s // ......................................................................e.................................................................................|................... + // trn2 v28.4s, v11.4s, v12.4s // .......................................................................e................................................................................|................... + // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................e............................................................................|................... + // trn2 v12.2d, v26.2d, v28.2d // ............................................................................e...........................................................................|................... + // trn1 v9.2d, v25.2d, v27.2d // .........................................................................e..............................................................................|................... + // trn1 v10.2d, v26.2d, v28.2d // ..........................................................................e.............................................................................|................... + // trn1 v25.4s, v13.4s, v14.4s // ..................................................................e.....................................................................................|................... + // trn2 v26.4s, v13.4s, v14.4s // ...................................................................e....................................................................................|................... 
+ // trn1 v27.4s, v15.4s, v16.4s // ...............................................................................e........................................................................|................... + // trn2 v28.4s, v15.4s, v16.4s // ................................................................................e.......................................................................|................... + // trn2 v15.2d, v25.2d, v27.2d // ..................................................................................e.....................................................................|................... + // trn2 v16.2d, v26.2d, v28.2d // .....................................................................................e..................................................................|................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................e....................................................................|................... + // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................e...................................................................|................... + // ldr q0, [x4], #64 // ...........e............................................................................................................................................|..........e........ + // ldr q1, [x4, #(-64 + 16)] // .............................................e..........................................................................................................|................... + // ldr q2, [x4, #(-64 + 32)] // .............................................................................e..........................................................................|................... 
+ // ldr q3, [x4, #(-64 + 48)] // .............................................................e..........................................................................................|................... + // sub v24.4s, v9.4s, v10.4s // ..............................................................................e.........................................................................|................... + // add v9.4s, v9.4s, v10.4s // .......................................................................................e................................................................|................... + // mul v10.4s, v24.4s, v1.s[2] // .............................................................................................e..........................................................|................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ......................................................................................e.................................................................|................... + // mls v10.4s, v24.4s, v8.s[0] // .......................................................................................................e................................................|................... + // sub v24.4s, v11.4s, v12.4s // .................................................................................e......................................................................|................... + // add v11.4s, v11.4s, v12.4s // .........................................................................................e..............................................................|................... + // mul v12.4s, v24.4s, v2.s[0] // ........................................................................................e...............................................................|................... 
+ // sqrdmulh v24.4s, v24.4s, v2.s[1] // ...............................................................................................e........................................................|................... + // mls v12.4s, v24.4s, v8.s[0] // ........................................................................................................e...............................................|................... + // sub v24.4s, v13.4s, v14.4s // ..........................................................................................e.............................................................|................... + // add v13.4s, v13.4s, v14.4s // ............................................................................................e...........................................................|................... + // mul v14.4s, v24.4s, v2.s[2] // .................................................................................................e......................................................|................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................e.......................................................|................... + // mls v14.4s, v24.4s, v8.s[0] // ............................................................................................................e...........................................|................... + // sub v24.4s, v15.4s, v16.4s // ...........................................................................................e............................................................|................... + // add v15.4s, v15.4s, v16.4s // ..............................................................................................e.........................................................|................... 
+ // mul v16.4s, v24.4s, v3.s[0] // ..................................................................................................e.....................................................|................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...................................................................................................e....................................................|................... + // mls v16.4s, v24.4s, v8.s[0] // .............................................................................................................e..........................................|................... + // sub v24.4s, v9.4s, v11.4s // .........................................................................................................e..............................................|................... + // add v9.4s, v9.4s, v11.4s // ....................................................................................................e...................................................|................... + // mul v11.4s, v24.4s, v0.s[2] // ...............................................................................................................e........................................|................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .......................................................................................................................e................................|................... + // mls v11.4s, v24.4s, v8.s[0] // ................................................................................................................................e.......................|................... + // sub v24.4s, v10.4s, v12.4s // ................................................................................................................e.......................................|................... 
+ // add v10.4s, v10.4s, v12.4s // .................................................................................................................e......................................|................... + // mul v12.4s, v24.4s, v0.s[2] // ........................................................................................................................e...............................|................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .........................................................................................................................e..............................|................... + // mls v12.4s, v24.4s, v8.s[0] // ...................................................................................................................................e....................|................... + // sub v24.4s, v13.4s, v15.4s // ......................................................................................................e.................................................|................... + // add v13.4s, v13.4s, v15.4s // .....................................................................................................e..................................................|................... + // mul v15.4s, v24.4s, v1.s[0] // ..................................................................................................................e.....................................|................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ..............................................................................................................e.........................................|................... + // mls v15.4s, v24.4s, v8.s[0] // ...............................................................................................................................e........................|................... 
+ // sub v24.4s, v14.4s, v16.4s // ......................................................................................................................e.................................|................... + // add v14.4s, v14.4s, v16.4s // .....................................................................................................................e..................................|................... + // mul v16.4s, v24.4s, v1.s[0] // ............................................................................................................................e...........................|................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...........................................................................................................................e............................|................... + // mls v16.4s, v24.4s, v8.s[0] // ......................................................................................................................................e.................|................... + // srshr v24.4S, v9.4S, #23 // ...........................................................................................................e............................................|................... + // mls v9.4s, v24.4s, v8.4s // ....................................................................................................................e...................................|................... + // srshr v24.4S, v10.4S, #23 // ..........................................................................................................................e.............................|................... + // mls v10.4s, v24.4s, v8.4s // .................................................................................................................................e......................|................... 
+ // srshr v24.4S, v13.4S, #23 // ..........................................................................................................e.............................................|................... + // mls v13.4s, v24.4s, v8.4s // ...................................................................................................................e....................................|................... + // srshr v24.4S, v14.4S, #23 // .............................................................................................................................e..........................|................... + // mls v14.4s, v24.4s, v8.4s // ....................................................................................................................................e...................|................... + // sub v24.4s, v9.4s, v13.4s // ..............................................................................................................................e.........................|................... + // add v9.4s, v9.4s, v13.4s // ..................................................................................................................................e.....................|................... + // mul v13.4s, v24.4s, v0.s[0] // .....................................................................................................................................e..................|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .......................................................................................................................................e................|................... + // mls v13.4s, v24.4s, v8.s[0] // .............................................................................................................................................e..........|................... 
+ // sub v24.4s, v10.4s, v14.4s // ..........................................................................................................................................e.............|................... + // add v10.4s, v10.4s, v14.4s // ............................................................................................................................................e...........|................... + // mul v14.4s, v24.4s, v0.s[0] // ..................................................................................................................................................e.....|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...................................................................................................................................................e....|................... + // mls v14.4s, v24.4s, v8.s[0] // ..........*.............................................................................................................................................|.........*......... + // sub v24.4s, v11.4s, v15.4s // .........................................................................................................................................e..............|................... + // add v11.4s, v11.4s, v15.4s // .................................................................................................................................................e......|................... + // mul v15.4s, v24.4s, v0.s[0] // ..............................................................................................................................................e.........|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...............................................................................................................................................e........|................... 
+ // mls v15.4s, v24.4s, v8.s[0] // ..*.....................................................................................................................................................|.*................. + // sub v24.4s, v12.4s, v16.4s // ...........................................................................................................................................e............|................... + // add v12.4s, v12.4s, v16.4s // ................................................................................................................................................e.......|................... + // mul v16.4s, v24.4s, v0.s[0] // ....................................................................................................................................................e...|................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................................................................................................................................................e..|................... + // mls v16.4s, v24.4s, v8.s[0] // .........*..............................................................................................................................................|........*.......... + // str q9, [x1], #(16*4) // ........................................................................................................................................e...............|................... + // str q10, [x1, #(-16*4 + 1*16)] // ....*...................................................................................................................................................|...*............... + // str q11, [x1, #(-16*4 + 2*16)] // ...*....................................................................................................................................................|..*................ 
+ // str q12, [x1, #(-16*4 + 3*16)] // .......................................................................................................................................................e|...................
+ // str q13, [x2], #(16*4) // ......................................................................................................................................................e.|...................
+ // str q14, [x2, #(-16*4 + 1*16)] // .................*......................................................................................................................................|................*..
+ // str q15, [x2, #(-16*4 + 2*16)] // ..............*.........................................................................................................................................|.............*.....
+ // str q16, [x2, #(-16*4 + 3*16)] // ................*.......................................................................................................................................|...............*...
+ // add x1, x1, #64 // .....*..................................................................................................................................................|....*..............
+ // add x2, x2, #64 // ...................*....................................................................................................................................|..................*
+
+ sub count, count, #1 // one pass of the loop body is done: decrement the iteration counter
+ cbnz count, layer45678_start // loop again while count != 0; falls through to the 10 instructions below once count reaches 0. Their trailing "*" column gives each one's index in the "original source code" listing that follows them (and that listing's "*" column gives the index here). NOTE(review): presumably the software-pipelining postamble emitted by SLOTHY - confirm against the loop body.
+ mls v11.4S, v10.4S, v8.S[0] // *.........
+ mls v12.4S, v14.4S, v8.S[0] // ....*.....
+ mls v6.4S, v23.4S, v8.S[0] // .....*....
+ str q26, [x1, #-32] // .*........
+ str q24, [x1, #-48] // ..*.......
+ add x1, x1, #64 // ...*......
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ str q11, [x2, #-32] // ......*...
+ str q12, [x2, #-16] // .......*..
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ str q6, [x2, #-48] // ........*.
+ add x2, x2, #64 // .........*
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+ // gap // ..........
+
+ // original source code
+ // mls v11.4S, v10.4S, v8.S[0] // *.........
+ // str q26, [x1, #-32] // ...*......
+ // str q24, [x1, #-48] // ....*.....
+ // add x1, x1, #64 // .....*....
+ // mls v12.4S, v14.4S, v8.S[0] // .*........
+ // mls v6.4S, v23.4S, v8.S[0] // ..*.......
+ // str q11, [x2, #-32] // ......*...
+ // str q12, [x2, #-16] // .......*..
+ // str q6, [x2, #-48] // ........*.
+ // add x2, x2, #64 // .........*
+
+
+// -----------------------------------------------------------------------------
+
+ ninv .req v25 // register alias: ninv names vector register v25
+ ninv_tw .req v26 // register alias: ninv_tw names v26
+ modulus_half .req v30 // register alias: modulus_half names v30
+ neg_modulus_half .req v31 // register alias: neg_modulus_half names v31
+
+ ASM_LOAD(xtmp, ninv_addr) // ASM_LOAD is defined outside this excerpt; presumably sets xtmp to the address of ninv_addr - confirm
+ ld1r {ninv.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes of ninv
+ ASM_LOAD(xtmp, ninv_tw_addr) // presumably sets xtmp to the address of ninv_tw_addr - confirm
+ ld1r {ninv_tw.4s}, [xtmp] // load one 32-bit word from [xtmp] and replicate it into all four lanes of ninv_tw
+
+ ushr modulus_half.4S, modulus.4S, #1 // modulus_half = modulus >> 1 (logical shift) in each 32-bit lane; the modulus alias is defined outside this excerpt
+ neg neg_modulus_half.4S, modulus_half.4S // neg_modulus_half = -modulus_half in each 32-bit lane
+
+ mov count, #8 // reset the iteration counter to 8 for the code that follows
+ ASM_LOAD(r_ptr0, roots_l012) // presumably sets r_ptr0 to the address of the roots_l012 table - confirm
+ load_roots_123 // macro defined outside this excerpt. NOTE(review): presumably loads the twiddle vectors whose lanes (v0-v3) the multiplies below read - confirm
+
+ .p2align 2 // align the next instruction to a 4-byte boundary. The 50 instructions from here to the next "original source code" marker are a reordered, register-renamed form of that listing ("*" column = index in the other order); v25/v26 in them are the registers aliased as ninv/ninv_tw above. NOTE(review): presumably the software-pipelining preamble of the layer123 loop, whose body lies beyond this excerpt - confirm.
+ ldr q12, [x0, #256] // ..*...............................................
+ ldr q19, [x0, #128] // *.................................................
+ ldr q5, [x0, #0] // .*................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ ldr q15, [x0, #512] // ............*.....................................
+ ldr q9, [x0, #384] // ...*..............................................
+ ldr q13, [x0, #896] // ..............*...................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ ldr q18, [x0, #640] // .............*....................................
+ ldr q27, [x0, #768] // ...............*..................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ sub v17.4S, v5.4S, v19.4S // .....*............................................
+ add v19.4S, v5.4S, v19.4S // ....*.............................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ sub v5.4S, v12.4S, v9.4S // .......*..........................................
+ add v9.4S, v12.4S, v9.4S // ......*...........................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ sub v12.4S, v15.4S, v18.4S // .................*................................
+ add v15.4S, v15.4S, v18.4S // ..................*...............................
+ add v18.4S, v27.4S, v13.4S // ........................*.........................
+ sqrdmulh v28.4S, v17.4S, v1.S[3] // ......................*...........................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ sub v13.4S, v27.4S, v13.4S // .........................*........................
+ mul v27.4S, v17.4S, v1.S[2] // ..........................*.......................
+ sqrdmulh v17.4S, v5.4S, v2.S[1] // ...................*..............................
+ mul v5.4S, v5.4S, v2.S[0] // ....................*.............................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ sqrdmulh v22.4S, v12.4S, v2.S[3] // .....................*............................
+ mul v16.4S, v12.4S, v2.S[2] // .......................*..........................
+ sub v12.4S, v15.4S, v18.4S // ............................*.....................
+ add v20.4S, v19.4S, v9.4S // ........*.........................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ sub v19.4S, v19.4S, v9.4S // .........*........................................
+ add v9.4S, v15.4S, v18.4S // .............................*....................
+ mul v15.4S, v13.4S, v3.S[0] // ..............................*...................
+ sqrdmulh v13.4S, v13.4S, v3.S[1] // ...............................*..................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ mls v27.4S, v28.4S, v8.S[0] // ................................*.................
+ mls v5.4S, v17.4S, v8.S[0] // ...........................*......................
+ mul v4.4S, v12.4S, v1.S[0] // .................................*................
+ sqrdmulh v12.4S, v12.4S, v1.S[1] // ...................................*..............
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ mls v16.4S, v22.4S, v8.S[0] // ....................................*.............
+ mul v17.4S, v19.4S, v0.S[2] // ..........*.......................................
+ sqrdmulh v19.4S, v19.4S, v0.S[3] // ...........*......................................
+ sub v18.4S, v20.4S, v9.4S // ..................................*...............
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ mls v15.4S, v13.4S, v8.S[0] // ......................................*...........
+ add v9.4S, v20.4S, v9.4S // .....................................*............
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ sub v13.4S, v27.4S, v5.4S // .......................................*..........
+ mls v4.4S, v12.4S, v8.S[0] // .........................................*........
+ sqrdmulh v7.4S, v18.4S, v0.S[1] // ........................................*.........
+ mul v14.4S, v18.4S, v0.S[0] // ..........................................*.......
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ add v5.4S, v27.4S, v5.4S // .............................................*....
+ mls v17.4S, v19.4S, v8.S[0] // ................*.................................
+ sqrdmulh v23.4S, v9.4S, v26.4S // ...........................................*......
+ mul v12.4S, v9.4S, v25.4S // ............................................*.....
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ add v22.4S, v16.4S, v15.4S // .................................................*
+ sub v15.4S, v16.4S, v15.4S // ................................................*.
+ mul v28.4S, v13.4S, v0.S[2] // ..............................................*...
+ sqrdmulh v24.4S, v13.4S, v0.S[3] // ...............................................*..
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+ // gap // ..................................................
+
+ // original source code
+ // ldr q16, [x0, #128] // .*................................................
+ // ldr q20, [x0, #0] // ..*...............................................
+ // ldr q6, [x0, #256] // *.................................................
+ // ldr q23, [x0, #384] // ....*.............................................
+ // add v27.4S, v20.4S, v16.4S // .........*........................................
+ // sub v24.4S, v20.4S, v16.4S // ........*.........................................
+ // add v22.4S, v6.4S, v23.4S // ...........*......................................
+ // sub v16.4S, v6.4S, v23.4S // ..........*.......................................
+ // add v7.4S, v27.4S, v22.4S // .......................*..........................
+ // sub v23.4S, v27.4S, v22.4S // ........................*.........................
+ // mul v17.4S, v23.4S, v0.S[2] // .................................*................
+ // sqrdmulh v23.4S, v23.4S, v0.S[3] // ..................................*...............
+ // ldr q20, [x0, #512] // ...*..............................................
+ // ldr q28, [x0, #640] // ......*...........................................
+ // ldr q22, [x0, #896] // .....*............................................
+ // ldr q27, [x0, #768] // .......*.......................................... + // mls v17.4S, v23.4S, v8.S[0] // ...........................................*...... + // sub v14.4S, v20.4S, v28.4S // ............*..................................... + // add v28.4S, v20.4S, v28.4S // .............*.................................... + // sqrdmulh v11.4S, v16.4S, v2.S[1] // ..................*............................... + // mul v13.4S, v16.4S, v2.S[0] // ...................*.............................. + // sqrdmulh v16.4S, v14.4S, v2.S[3] // ....................*............................. + // sqrdmulh v21.4S, v24.4S, v1.S[3] // ...............*.................................. + // mul v19.4S, v14.4S, v2.S[2] // .....................*............................ + // add v20.4S, v27.4S, v22.4S // ..............*................................... + // sub v10.4S, v27.4S, v22.4S // ................*................................. + // mul v27.4S, v24.4S, v1.S[2] // .................*................................ + // mls v13.4S, v11.4S, v8.S[0] // .............................*.................... + // sub v23.4S, v28.4S, v20.4S // ......................*........................... + // add v20.4S, v28.4S, v20.4S // .........................*........................ + // mul v22.4S, v10.4S, v3.S[0] // ..........................*....................... + // sqrdmulh v10.4S, v10.4S, v3.S[1] // ...........................*...................... + // mls v27.4S, v21.4S, v8.S[0] // ............................*..................... + // mul v4.4S, v23.4S, v1.S[0] // ..............................*................... + // sub v11.4S, v7.4S, v20.4S // ...................................*.............. + // sqrdmulh v14.4S, v23.4S, v1.S[1] // ...............................*.................. + // mls v19.4S, v16.4S, v8.S[0] // ................................*................. 
+ // add v24.4S, v7.4S, v20.4S // .....................................*............ + // mls v22.4S, v10.4S, v8.S[0] // ....................................*............. + // sub v10.4S, v27.4S, v13.4S // ......................................*........... + // sqrdmulh v7.4S, v11.4S, v0.S[1] // ........................................*......... + // mls v4.4S, v14.4S, v8.S[0] // .......................................*.......... + // mul v14.4S, v11.4S, v0.S[0] // .........................................*........ + // sqrdmulh v23.4S, v24.4S, v26.4S // ............................................*..... + // mul v12.4S, v24.4S, v25.4S // .............................................*.... + // add v5.4S, v27.4S, v13.4S // ..........................................*....... + // mul v28.4S, v10.4S, v0.S[2] // ................................................*. + // sqrdmulh v24.4S, v10.4S, v0.S[3] // .................................................* + // sub v15.4S, v19.4S, v22.4S // ...............................................*.. + // add v22.4S, v19.4S, v22.4S // ..............................................*... + + sub count, count, #1 +layer123_start: + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + ldr q16, [x0, #144] // .e...................................................................................................................... + ldr q20, [x0, #16] // e....................................................................................................................... + ldr q6, [x0, #272] // ..e..................................................................................................................... 
+ add v19.4S, v17.4S, v4.4S // ...........................................................*............................................................ + mls v14.4S, v7.4S, v8.S[0] // ....................................................*................................................................... + mls v12.4S, v23.4S, v8.S[0] // ..........................................................................................*............................. + ldr q23, [x0, #400] // ...e.................................................................................................................... + sub v27.4S, v17.4S, v4.4S // ..........................................................*............................................................. + mul v4.4S, v15.4S, v1.S[0] // .............................................*.......................................................................... + sqrdmulh v18.4S, v15.4S, v1.S[1] // ..............................................*......................................................................... + // gap // ........................................................................................................................ + sub v15.4S, v5.4S, v22.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v28.4S, v24.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ 
+ add v11.4S, v5.4S, v22.4S // ......................................................*................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v13.4S, v19.4S, v25.4S // ..............................................................................................*......................... + sqrdmulh v10.4S, v19.4S, v26.4S // ...............................................................................................*........................ + mul v5.4S, v27.4S, v0.S[0] // ............................................................*........................................................... + // gap // ........................................................................................................................ + sqrdmulh v21.4S, v27.4S, v0.S[1] // .............................................................*.......................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v9.4S, v15.4S, v0.S[1] // ........................................................*............................................................... + // gap // ........................................................................................................................ 
+ mul v19.4S, v15.4S, v0.S[0] // .......................................................*................................................................ + sqrdmulh v17.4S, v11.4S, v26.4S // ............................................................................................*........................... + mls v4.4S, v18.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v27.4S, v20.4S, v16.4S // .........e.............................................................................................................. + // gap // ........................................................................................................................ + sub v24.4S, v20.4S, v16.4S // ........e............................................................................................................... + mul v15.4S, v11.4S, v25.4S // ...........................................................................................*............................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ add v22.4S, v6.4S, v23.4S // ..............e......................................................................................................... + // gap // ........................................................................................................................ + sub v16.4S, v6.4S, v23.4S // .............e.......................................................................................................... + mls v13.4S, v10.4S, v8.S[0] // ................................................................................................*....................... + mls v5.4S, v21.4S, v8.S[0] // ..............................................................*......................................................... + cmge v21.4S, v31.4S, v14.4S // ....................................................................*................................................... + cmge v10.4S, v14.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v9.4S, v8.S[0] // .........................................................*.............................................................. + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v7.4S, v27.4S, v22.4S // .............................e.......................................................................................... + sub v23.4S, v27.4S, v22.4S // ............................e........................................................................................... + add v27.4S, v28.4S, v4.4S // ................................................................*....................................................... + sub v9.4S, v28.4S, v4.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + mls v15.4S, v17.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v22.4S, v31.4S, v13.4S // ............................................................................................................*........... + sub v10.4S, v21.4S, v10.4S // ......................................................................*................................................. 
+ cmge v17.4S, v13.4S, v30.4S // .............................................................................................................*.......... + cmge v21.4S, v31.4S, v5.4S // ............................................................................*........................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v4.4S, v5.4S, v30.4S // .............................................................................*.......................................... + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v19.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + cmge v18.4S, v19.4S, v30.4S // .........................................................................*.............................................. + sub v22.4S, v22.4S, v17.4S // ..............................................................................................................*......... + mul v17.4S, v23.4S, v0.S[2] // ..............................e......................................................................................... + sqrdmulh v23.4S, v23.4S, v0.S[3] // ...............................e........................................................................................ + ldr q20, [x0, #528] // ....e................................................................................................................... 
+ ldr q28, [x0, #656] // .....e.................................................................................................................. + // gap // ........................................................................................................................ + sqrdmulh v11.4S, v9.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v14.4S, v10.4S, v29.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + sub v10.4S, v6.4S, v18.4S // ..........................................................................*............................................. + // gap // ........................................................................................................................ + sub v18.4S, v21.4S, v4.4S // ..............................................................................*......................................... + sqrdmulh v4.4S, v27.4S, v26.4S // ..................................................................................................*..................... + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v15.4S // ........................................................................................................*............... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v21.4S, v31.4S, v12.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + mul v9.4S, v9.4S, v0.S[0] // .................................................................*...................................................... + mls v13.4S, v22.4S, v29.4S // ...............................................................................................................*........ + ldr q22, [x0, #912] // .......e................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v19.4S, v10.4S, v29.4S // ...........................................................................*............................................ + mls v5.4S, v18.4S, v29.4S // ...............................................................................*........................................ + mul v18.4S, v27.4S, v25.4S // .................................................................................................*...................... 
+ ldr q27, [x0, #784] // ......e................................................................................................................. + cmge v10.4S, v12.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v17.4S, v23.4S, v8.S[0] // ................................e....................................................................................... + // gap // ........................................................................................................................ + cmge v23.4S, v15.4S, v30.4S // .........................................................................................................*.............. + str q14, [x0, #512] // ....................................................................................*................................... + sub v14.4S, v20.4S, v28.4S // ..................e..................................................................................................... + add v28.4S, v20.4S, v28.4S // ...................e.................................................................................................... + mls v9.4S, v11.4S, v8.S[0] // ...................................................................*.................................................... + sqrdmulh v11.4S, v16.4S, v2.S[1] // ................e....................................................................................................... + // gap // ........................................................................................................................ 
+ sub v10.4S, v21.4S, v10.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q13, [x0, #256] // ......................................................................................................................*. + mul v13.4S, v16.4S, v2.S[0] // ...............e........................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q19, [x0, #640] // .....................................................................................*.................................. + sqrdmulh v16.4S, v14.4S, v2.S[3] // .....................e.................................................................................................. + mls v18.4S, v4.4S, v8.S[0] // ...................................................................................................*.................... + sqrdmulh v21.4S, v24.4S, v1.S[3] // ...........e............................................................................................................ + mul v19.4S, v14.4S, v2.S[2] // ....................e................................................................................................... 
+ str q5, [x0, #768] // ......................................................................................*................................. + add v20.4S, v27.4S, v22.4S // ........................e............................................................................................... + // gap // ........................................................................................................................ + mls v12.4S, v10.4S, v29.4S // .......................................................................................................*................ + sub v10.4S, v27.4S, v22.4S // .......................e................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v27.4S, v24.4S, v1.S[2] // ..........e............................................................................................................. + sub v5.4S, v6.4S, v23.4S // ..........................................................................................................*............. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v11.4S, v8.S[0] // .................e...................................................................................................... 
+ cmge v6.4S, v9.4S, v30.4S // .................................................................................*...................................... + cmge v22.4S, v31.4S, v9.4S // ................................................................................*....................................... + cmge v14.4S, v18.4S, v30.4S // .................................................................................................................*...... + cmge v4.4S, v31.4S, v18.4S // ................................................................................................................*....... + // gap // ........................................................................................................................ + sub v23.4S, v28.4S, v20.4S // ......................................e................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v20.4S, v28.4S, v20.4S // .......................................e................................................................................ + sub v6.4S, v22.4S, v6.4S // ..................................................................................*..................................... + // gap // ........................................................................................................................ + str q12, [x0], #(16) // ....................................................................................................................*... 
+ mul v22.4S, v10.4S, v3.S[0] // .........................e.............................................................................................. + sqrdmulh v10.4S, v10.4S, v3.S[1] // ..........................e............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v27.4S, v21.4S, v8.S[0] // ............e........................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v12.4S, v4.4S, v14.4S // ..................................................................................................................*..... + mul v4.4S, v23.4S, v1.S[0] // ........................................e............................................................................... + sub v11.4S, v7.4S, v20.4S // ................................................e....................................................................... + sqrdmulh v14.4S, v23.4S, v1.S[1] // .........................................e.............................................................................. 
+ // gap // ........................................................................................................................ + mls v15.4S, v5.4S, v29.4S // ...........................................................................................................*............ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v9.4S, v6.4S, v29.4S // ...................................................................................*.................................... + mls v19.4S, v16.4S, v8.S[0] // ......................e................................................................................................. + add v24.4S, v7.4S, v20.4S // .................................................e...................................................................... + // gap // ........................................................................................................................ + mls v18.4S, v12.4S, v29.4S // ...................................................................................................................*.... + mls v22.4S, v10.4S, v8.S[0] // ...........................e............................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v10.4S, v27.4S, v13.4S // .................................e...................................................................................... + sqrdmulh v7.4S, v11.4S, v0.S[1] // ...................................................e.................................................................... + // gap // ........................................................................................................................ + mls v4.4S, v14.4S, v8.S[0] // ..........................................e............................................................................. + mul v14.4S, v11.4S, v0.S[0] // ..................................................e..................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v23.4S, v24.4S, v26.4S // .........................................................................................e.............................. + mul v12.4S, v24.4S, v25.4S // ........................................................................................e............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v5.4S, v27.4S, v13.4S // ..................................e..................................................................................... 
+ str q15, [x0, #112] // .....................................................................................................................*.. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v28.4S, v10.4S, v0.S[2] // ...................................e.................................................................................... + sqrdmulh v24.4S, v10.4S, v0.S[3] // ....................................e................................................................................... + str q18, [x0, #368] // .......................................................................................................................* + // gap // ........................................................................................................................ + str q9, [x0, #880] // .......................................................................................*................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v15.4S, v19.4S, v22.4S // ...........................................e............................................................................ + add v22.4S, v19.4S, v22.4S // ............................................e........................................................................... 
+ + // original source code + // ldr q9, [x0, #0] // .e......................................................................................................................|e.................................................................................................................... + // ldr q10, [x0, #(1*(1024/8))] // e.......................................................................................................................e..................................................................................................................... + // ldr q11, [x0, #(2*(1024/8))] // ..e.....................................................................................................................|.e................................................................................................................... + // ldr q12, [x0, #(3*(1024/8))] // ......e.................................................................................................................|.....e............................................................................................................... + // ldr q13, [x0, #(4*(1024/8))] // ..............................................e.........................................................................|.............................................e....................................................................... + // ldr q14, [x0, #(5*(1024/8))] // ...............................................e........................................................................|..............................................e...................................................................... + // ldr q15, [x0, #(6*(1024/8))] // .............................................................e..........................................................|............................................................e........................................................ 
+ // ldr q16, [x0, #(7*(1024/8))] // .........................................................e..............................................................|........................................................e............................................................ + // sub v24.4s, v9.4s, v10.4s // ......................e.................................................................................................|.....................e............................................................................................... + // add v9.4s, v9.4s, v10.4s // .....................e..................................................................................................|....................e................................................................................................ + // mul v10.4s, v24.4s, v1.s[2] // ..................................................................................e.....................................|.................................................................................e................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ............................................................................e...........................................|...........................................................................e......................................... + // mls v10.4s, v24.4s, v8.s[0] // ...............................................................................................e........................|..............................................................................................e...................... + // sub v24.4s, v11.4s, v12.4s // .........................e..............................................................................................|........................e............................................................................................ 
+ // add v11.4s, v11.4s, v12.4s // ........................e...............................................................................................|.......................e............................................................................................. + // mul v12.4s, v24.4s, v2.s[0] // ........................................................................e...............................................|.......................................................................e............................................. + // sqrdmulh v24.4s, v24.4s, v2.s[1] // .....................................................................e..................................................|....................................................................e................................................ + // mls v12.4s, v24.4s, v8.s[0] // ....................................................................................e...................................|...................................................................................e................................. + // sub v24.4s, v13.4s, v14.4s // ..................................................................e.....................................................|.................................................................e................................................... + // add v13.4s, v13.4s, v14.4s // ...................................................................e....................................................|..................................................................e.................................................. + // mul v14.4s, v24.4s, v2.s[2] // .............................................................................e..........................................|............................................................................e........................................ 
+ // sqrdmulh v24.4s, v24.4s, v2.s[3] // ..........................................................................e.............................................|.........................................................................e........................................... + // mls v14.4s, v24.4s, v8.s[0] // ......................................................................................................e.................|.....................................................................................................e............... + // sub v24.4s, v15.4s, v16.4s // .................................................................................e......................................|................................................................................e.................................... + // add v15.4s, v15.4s, v16.4s // ...............................................................................e........................................|..............................................................................e...................................... + // mul v16.4s, v24.4s, v3.s[0] // .............................................................................................e..........................|............................................................................................e........................ + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ..............................................................................................e.........................|.............................................................................................e....................... + // mls v16.4s, v24.4s, v8.s[0] // .........................................................................................................e..............|........................................................................................................e............ 
+ // sub v24.4s, v9.4s, v11.4s // ................................e.......................................................................................|...............................e..................................................................................... + // add v9.4s, v9.4s, v11.4s // ...............................e........................................................................................|..............................e...................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ............................................e...........................................................................|...........................................e......................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............................................e..........................................................................|............................................e........................................................................ + // mls v11.4s, v24.4s, v8.s[0] // ...............................................................e........................................................|..............................................................e...................................................... + // sub v24.4s, v10.4s, v12.4s // ..........................................................................................................e.............|.........................................................................................................e........... + // add v10.4s, v10.4s, v12.4s // ................................................................................................................e.......|...............................................................................................................e..... 
+ // mul v12.4s, v24.4s, v0.s[2] // ..................................................................................................................e.....|.................................................................................................................e... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ...................................................................................................................e....|..................................................................................................................e.. + // mls v12.4s, v24.4s, v8.s[0] // ...........*............................................................................................................|..........*.......................................................................................................... + // sub v24.4s, v13.4s, v15.4s // .........................................................................................e..............................|........................................................................................e............................ + // add v13.4s, v13.4s, v15.4s // ..........................................................................................e.............................|.........................................................................................e........................... + // mul v15.4s, v24.4s, v1.s[0] // .................................................................................................e......................|................................................................................................e.................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...................................................................................................e....................|..................................................................................................e.................. 
+ // mls v15.4s, v24.4s, v8.s[0] // ............................................................................................................e...........|...........................................................................................................e......... + // sub v24.4s, v14.4s, v16.4s // ......................................................................................................................e.|..................................................................................................................... + // add v14.4s, v14.4s, v16.4s // .......................................................................................................................e|..................................................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // ........*...............................................................................................................|.......*............................................................................................................. + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .........*..............................................................................................................|........*............................................................................................................ + // mls v16.4s, v24.4s, v8.s[0] // ....................*...................................................................................................|...................*................................................................................................. + // sub v24.4s, v9.4s, v13.4s // ..................................................................................................e.....................|.................................................................................................e................... 
+ // add v9.4s, v9.4s, v13.4s // .......................................................................................................e................|......................................................................................................e.............. + // mul v13.4s, v24.4s, v0.s[0] // .............................................................................................................e..........|............................................................................................................e........ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ...........................................................................................................e............|..........................................................................................................e.......... + // mls v13.4s, v24.4s, v8.s[0] // ....*...................................................................................................................|...*................................................................................................................. + // sub v24.4s, v10.4s, v14.4s // ..........*.............................................................................................................|.........*........................................................................................................... + // add v10.4s, v10.4s, v14.4s // ............*...........................................................................................................|...........*......................................................................................................... + // mul v14.4s, v24.4s, v0.s[0] // ..................*.....................................................................................................|.................*................................................................................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................*......................................................................................................|................*.................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ..............................*.........................................................................................|.............................*....................................................................................... + // sub v24.4s, v11.4s, v15.4s // .......*................................................................................................................|......*.............................................................................................................. + // add v11.4s, v11.4s, v15.4s // ...*....................................................................................................................|..*.................................................................................................................. + // mul v15.4s, v24.4s, v0.s[0] // ...............*........................................................................................................|..............*...................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................*.......................................................................................................|...............*..................................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ...........................*............................................................................................|..........................*.......................................................................................... 
+ // sub v24.4s, v12.4s, v16.4s // ..................................*.....................................................................................|.................................*................................................................................... + // add v12.4s, v12.4s, v16.4s // .................................*......................................................................................|................................*.................................................................................... + // mul v16.4s, v24.4s, v0.s[0] // .......................................................*................................................................|......................................................*.............................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ................................................*.......................................................................|...............................................*..................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ....................................................................*...................................................|...................................................................*................................................. + // cmge v27.4s, v31.4s, v13.4s // ............................*...........................................................................................|...........................*......................................................................................... + // cmge v28.4s, v13.4s, v30.4s // .............................*..........................................................................................|............................*........................................................................................ 
+ // sub v28.4s, v27.4s, v28.4s // .....................................*..................................................................................|....................................*................................................................................ + // mls v13.4s, v28.4s, v29.4s // .................................................*......................................................................|................................................*.................................................................... + // cmge v27.4s, v31.4s, v14.4s // .........................................*..............................................................................|........................................*............................................................................ + // cmge v28.4s, v14.4s, v30.4s // ..........................................*.............................................................................|.........................................*........................................................................... + // sub v28.4s, v27.4s, v28.4s // ..................................................*.....................................................................|.................................................*................................................................... + // mls v14.4s, v28.4s, v29.4s // ..........................................................*.............................................................|.........................................................*........................................................... + // cmge v27.4s, v31.4s, v15.4s // .......................................*................................................................................|......................................*.............................................................................. 
+ // cmge v28.4s, v15.4s, v30.4s // ........................................*...............................................................................|.......................................*............................................................................. + // sub v28.4s, v27.4s, v28.4s // ...................................................*....................................................................|..................................................*.................................................................. + // mls v15.4s, v28.4s, v29.4s // ...........................................................*............................................................|..........................................................*.......................................................... + // cmge v27.4s, v31.4s, v16.4s // ......................................................................................*.................................|.....................................................................................*............................... + // cmge v28.4s, v16.4s, v30.4s // .....................................................................................*..................................|....................................................................................*................................ + // sub v28.4s, v27.4s, v28.4s // ...........................................................................................*............................|..........................................................................................*.......................... + // mls v16.4s, v28.4s, v29.4s // .....................................................................................................*..................|....................................................................................................*................ 
+ // str q13, [x0, #(4*(1024/8))] // .................................................................*......................................................|................................................................*.................................................... + // str q14, [x0, #(5*(1024/8))] // .........................................................................*..............................................|........................................................................*............................................ + // str q15, [x0, #(6*(1024/8))] // ..............................................................................*.........................................|.............................................................................*....................................... + // str q16, [x0, #(7*(1024/8))] // .....................................................................................................................*..|....................................................................................................................* + // mul v13.4s, v9.4s, v25.4s // ...............................................................................................................e........|..............................................................................................................e...... + // sqrdmulh v9.4s, v9.4s, v26.4s // ..............................................................................................................e.........|.............................................................................................................e....... + // mls v13.4s, v9.4s, v8.s[0] // .....*..................................................................................................................|....*................................................................................................................ 
+ // mul v14.4s, v10.4s, v25.4s // .......................*................................................................................................|......................*.............................................................................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // ...................*....................................................................................................|..................*.................................................................................................. + // mls v14.4s, v10.4s, v8.s[0] // ...................................*....................................................................................|..................................*.................................................................................. + // mul v15.4s, v11.4s, v25.4s // .............*..........................................................................................................|............*........................................................................................................ + // sqrdmulh v11.4s, v11.4s, v26.4s // ..............*.........................................................................................................|.............*....................................................................................................... + // mls v15.4s, v11.4s, v8.s[0] // ..........................*.............................................................................................|.........................*........................................................................................... + // mul v16.4s, v12.4s, v25.4s // ............................................................*...........................................................|...........................................................*......................................................... 
+ // sqrdmulh v12.4s, v12.4s, v26.4s // ....................................................*...................................................................|...................................................*................................................................. + // mls v16.4s, v12.4s, v8.s[0] // ...........................................................................*............................................|..........................................................................*.......................................... + // cmge v27.4s, v31.4s, v13.4s // ......................................................*.................................................................|.....................................................*............................................................... + // cmge v28.4s, v13.4s, v30.4s // ..............................................................*.........................................................|.............................................................*....................................................... + // sub v28.4s, v27.4s, v28.4s // ......................................................................*.................................................|.....................................................................*............................................... + // mls v13.4s, v28.4s, v29.4s // ................................................................................*.......................................|...............................................................................*..................................... + // cmge v27.4s, v31.4s, v14.4s // .....................................................*..................................................................|....................................................*................................................................ 
+ // cmge v28.4s, v14.4s, v30.4s // ................................................................*.......................................................|...............................................................*..................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................................................*....................................|..................................................................................*.................................. + // mls v14.4s, v28.4s, v29.4s // ....................................................................................................*...................|...................................................................................................*................. + // cmge v27.4s, v31.4s, v15.4s // ....................................*...................................................................................|...................................*................................................................................. + // cmge v28.4s, v15.4s, v30.4s // ......................................*.................................................................................|.....................................*............................................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................*............................................................................|..........................................*.......................................................................... + // mls v15.4s, v28.4s, v29.4s // ........................................................*...............................................................|.......................................................*............................................................. 
+ // cmge v27.4s, v31.4s, v16.4s // ........................................................................................*...............................|.......................................................................................*............................. + // cmge v28.4s, v16.4s, v30.4s // .......................................................................................*................................|......................................................................................*.............................. + // sub v28.4s, v27.4s, v28.4s // ................................................................................................*.......................|...............................................................................................*..................... + // mls v16.4s, v28.4s, v29.4s // ........................................................................................................*...............|.......................................................................................................*............. + // str q13, [x0], #(16) // ............................................................................................*...........................|...........................................................................................*......................... + // str q14, [x0, #(-16 + 1*(1024/8))] // .................................................................................................................*......|................................................................................................................*.... + // str q15, [x0, #(-16 + 2*(1024/8))] // .......................................................................*................................................|......................................................................*.............................................. 
+ // str q16, [x0, #(-16 + 3*(1024/8))] // ....................................................................................................................*...|...................................................................................................................*. + + sub count, count, #1 + cbnz count, layer123_start + mls v28.4S, v24.4S, v8.S[0] // .......*.............................................................. + mul v13.4S, v15.4S, v1.S[0] // ....*................................................................. + sqrdmulh v21.4S, v15.4S, v1.S[1] // .....*................................................................ + sub v11.4S, v17.4S, v4.4S // ...*.................................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v20.4S, v17.4S, v4.4S // *..................................................................... + add v15.4S, v5.4S, v22.4S // ........*............................................................. + mls v14.4S, v7.4S, v8.S[0] // .*.................................................................... + mls v12.4S, v23.4S, v8.S[0] // ..*................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v9.4S, v5.4S, v22.4S // ......*............................................................... 
+ sqrdmulh v4.4S, v11.4S, v0.S[1] // ............*......................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v23.4S, v20.4S, v26.4S // ..........*........................................................... + mul v20.4S, v20.4S, v25.4S // .........*............................................................ + sqrdmulh v19.4S, v15.4S, v26.4S // ...............*...................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v13.4S, v21.4S, v8.S[0] // ................*..................................................... + cmge v6.4S, v31.4S, v12.4S // ........................................*............................. + mul v18.4S, v9.4S, v0.S[0] // ..............*....................................................... + mul v27.4S, v15.4S, v25.4S // .................*.................................................... + mul v16.4S, v11.4S, v0.S[0] // ...........*.......................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + cmge v10.4S, v31.4S, v14.4S // ....................*................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v7.4S, v14.4S, v30.4S // .....................*................................................ + cmge v17.4S, v12.4S, v30.4S // ..............................................*....................... + add v5.4S, v28.4S, v13.4S // .......................*.............................................. + sub v15.4S, v28.4S, v13.4S // ........................*............................................. + mls v20.4S, v23.4S, v8.S[0] // ..................*................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v22.4S, v9.4S, v0.S[1] // .............*........................................................ + sub v28.4S, v6.4S, v17.4S // ..................................................*................... + mls v16.4S, v4.4S, v8.S[0] // ...................*.................................................. 
+ mls v27.4S, v19.4S, v8.S[0] // .........................*............................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v21.4S, v5.4S, v25.4S // .............................................*........................ + sqrdmulh v9.4S, v5.4S, v26.4S // ......................................*............................... + sqrdmulh v11.4S, v15.4S, v0.S[1] // ..................................*................................... + mul v17.4S, v15.4S, v0.S[0] // .........................................*............................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v12.4S, v28.4S, v29.4S // .......................................................*.............. + sub v4.4S, v10.4S, v7.4S // ...........................*.......................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v10.4S, v31.4S, v20.4S // ..........................*........................................... 
+ mls v18.4S, v22.4S, v8.S[0] // ......................*............................................... + cmge v19.4S, v20.4S, v30.4S // ............................*......................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v24.4S, v16.4S, v30.4S // ..............................*....................................... + cmge v15.4S, v31.4S, v16.4S // .............................*........................................ + mls v14.4S, v4.4S, v29.4S // ...................................*.................................. + mls v21.4S, v9.4S, v8.S[0] // .....................................................*................ + mls v17.4S, v11.4S, v8.S[0] // .................................................*.................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q12, [x0], #(16) // ..............................................................*....... + sub v5.4S, v10.4S, v19.4S // .................................*.................................... + sub v4.4S, v15.4S, v24.4S // .....................................*................................ + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v15.4S, v18.4S, v30.4S // ................................*..................................... + cmge v23.4S, v27.4S, v30.4S // ...............................................*...................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + cmge v24.4S, v31.4S, v27.4S // .......................................*.............................. + cmge v6.4S, v31.4S, v18.4S // ...............................*...................................... + str q14, [x0, #496] // ................................................*..................... + cmge v13.4S, v17.4S, v30.4S // .........................................................*............ + cmge v28.4S, v31.4S, v17.4S // ..........................................................*........... + cmge v14.4S, v21.4S, v30.4S // ...........................................................*.......... + cmge v22.4S, v31.4S, v21.4S // ............................................................*......... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v20.4S, v5.4S, v29.4S // ..........................................*........................... + sub v10.4S, v6.4S, v15.4S // ....................................*................................. 
+ sub v9.4S, v24.4S, v23.4S // ........................................................*............. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v16.4S, v4.4S, v29.4S // ............................................*......................... + sub v12.4S, v28.4S, v13.4S // .............................................................*........ + sub v22.4S, v22.4S, v14.4S // ...............................................................*...... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v27.4S, v9.4S, v29.4S // ................................................................*..... + mls v18.4S, v10.4S, v29.4S // ...........................................*.......................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ mls v21.4S, v22.4S, v29.4S // ..................................................................*... + mls v17.4S, v12.4S, v29.4S // .................................................................*.... + str q20, [x0, #240] // ...................................................*.................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q16, [x0, #752] // ......................................................*............... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q27, [x0, #112] // ...................................................................*.. + str q18, [x0, #624] // ....................................................*................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + str q21, [x0, #368] // ....................................................................*. + str q17, [x0, #880] // .....................................................................* + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + + // original source code + // add v19.4S, v17.4S, v4.4S // ....*................................................................. + // mls v14.4S, v7.4S, v8.S[0] // ......*............................................................... + // mls v12.4S, v23.4S, v8.S[0] // .......*.............................................................. + // sub v27.4S, v17.4S, v4.4S // ...*.................................................................. + // mul v4.4S, v15.4S, v1.S[0] // .*.................................................................... + // sqrdmulh v18.4S, v15.4S, v1.S[1] // ..*................................................................... + // sub v15.4S, v5.4S, v22.4S // ........*............................................................. + // mls v28.4S, v24.4S, v8.S[0] // *..................................................................... + // add v11.4S, v5.4S, v22.4S // .....*................................................................ + // mul v13.4S, v19.4S, v25.4S // ...........*.......................................................... 
+ // sqrdmulh v10.4S, v19.4S, v26.4S // ..........*........................................................... + // mul v5.4S, v27.4S, v0.S[0] // .................*.................................................... + // sqrdmulh v21.4S, v27.4S, v0.S[1] // .........*............................................................ + // sqrdmulh v9.4S, v15.4S, v0.S[1] // ........................*............................................. + // mul v19.4S, v15.4S, v0.S[0] // ...............*...................................................... + // sqrdmulh v17.4S, v11.4S, v26.4S // ............*......................................................... + // mls v4.4S, v18.4S, v8.S[0] // .............*........................................................ + // mul v15.4S, v11.4S, v25.4S // ................*..................................................... + // mls v13.4S, v10.4S, v8.S[0] // .......................*.............................................. + // mls v5.4S, v21.4S, v8.S[0] // ..........................*........................................... + // cmge v21.4S, v31.4S, v14.4S // ..................*................................................... + // cmge v10.4S, v14.4S, v30.4S // ...................*.................................................. + // mls v19.4S, v9.4S, v8.S[0] // ...................................*.................................. + // add v27.4S, v28.4S, v4.4S // .....................*................................................ + // sub v9.4S, v28.4S, v4.4S // ......................*............................................... + // mls v15.4S, v17.4S, v8.S[0] // ...........................*.......................................... + // cmge v22.4S, v31.4S, v13.4S // ..................................*................................... + // sub v10.4S, v21.4S, v10.4S // .................................*.................................... 
+ // cmge v17.4S, v13.4S, v30.4S // ....................................*................................. + // cmge v21.4S, v31.4S, v5.4S // ......................................*............................... + // cmge v4.4S, v5.4S, v30.4S // .....................................*................................ + // cmge v6.4S, v31.4S, v19.4S // ................................................*..................... + // cmge v18.4S, v19.4S, v30.4S // .............................................*........................ + // sub v22.4S, v22.4S, v17.4S // ...........................................*.......................... + // sqrdmulh v11.4S, v9.4S, v0.S[1] // ..............................*....................................... + // mls v14.4S, v10.4S, v29.4S // .......................................*.............................. + // sub v10.4S, v6.4S, v18.4S // .......................................................*.............. + // sub v18.4S, v21.4S, v4.4S // ............................................*......................... + // sqrdmulh v4.4S, v27.4S, v26.4S // .............................*........................................ + // cmge v6.4S, v31.4S, v15.4S // ...............................................*...................... + // cmge v21.4S, v31.4S, v12.4S // ..............*....................................................... + // mul v9.4S, v9.4S, v0.S[0] // ...............................*...................................... + // mls v13.4S, v22.4S, v29.4S // ......................................................*............... + // mls v19.4S, v10.4S, v29.4S // .............................................................*........ + // mls v5.4S, v18.4S, v29.4S // .........................................................*............ + // mul v18.4S, v27.4S, v25.4S // ............................*......................................... 
+ // cmge v10.4S, v12.4S, v30.4S // ....................*................................................. + // cmge v23.4S, v15.4S, v30.4S // ..............................................*....................... + // str q14, [x0, #512] // .................................................*.................... + // mls v9.4S, v11.4S, v8.S[0] // .........................................*............................ + // sub v10.4S, v21.4S, v10.4S // .........................*............................................ + // str q13, [x0, #256] // ................................................................*..... + // str q19, [x0, #640] // ...................................................................*.. + // mls v18.4S, v4.4S, v8.S[0] // ........................................*............................. + // str q5, [x0, #768] // .................................................................*.... + // mls v12.4S, v10.4S, v29.4S // ................................*..................................... + // sub v5.4S, v6.4S, v23.4S // ........................................................*............. + // cmge v6.4S, v9.4S, v30.4S // ..................................................*................... + // cmge v22.4S, v31.4S, v9.4S // ...................................................*.................. + // cmge v14.4S, v18.4S, v30.4S // ....................................................*................. + // cmge v4.4S, v31.4S, v18.4S // .....................................................*................ + // sub v6.4S, v22.4S, v6.4S // ..........................................................*........... + // str q12, [x0], #(16) // ..........................................*........................... + // sub v12.4S, v4.4S, v14.4S // ...........................................................*.......... + // mls v15.4S, v5.4S, v29.4S // ............................................................*......... 
+ // mls v9.4S, v6.4S, v29.4S // ...............................................................*...... + // mls v18.4S, v12.4S, v29.4S // ..............................................................*....... + // str q15, [x0, #112] // ..................................................................*... + // str q18, [x0, #368] // ....................................................................*. + // str q9, [x0, #880] // .....................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s new file mode 100644 index 00000000..eb5264de --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_45678_opt_m1_icestorm.s @@ -0,0 +1,1999 @@ + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. + +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. 
+xtmp0 .req x10 +xtmp1 .req x11 + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +.macro vqrdmulh d,a,b + sqrdmulh \d\().4s, \a\().4s, \b\().4s +.endm +.macro vmls d,a,b + mls \d\().4s, \a\().4s, \b\().4s +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().4s, \a\().4s, \b\().4s[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().4s, \a\().4s, \b\().s[\i] +.endm +.macro vmlsq d,a,b,i + mls \d\().4s, \a\().4s, \b\().s[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().4s, \src\().4s, \const\().4s + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro barrett_reduce_single a + srshr tmp.4S, \a\().4S, #23 + vmls \a, tmp, consts +.endm + +.macro canonical_reduce a, modulus_half, neg_modulus_half, tmp1, tmp2 + cmge \tmp1\().4s, \neg_modulus_half\().4s, \a\().4s + cmge \tmp2\().4s, \a\().4s, \modulus_half\().4s + sub \tmp2\().4s, \tmp1\().4s, \tmp2\().4s + vmls \a, \tmp2, consts +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.4s, \a\().4s, \b\().4s + add \a\().4s, \a\().4s, \b\().4s + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro load_vectors a0, a1, a2, a3, 
addr + ldr_vo \a0, \addr, (16*0) + ldr_vo \a1, \addr, (16*1) + ldr_vo \a2, \addr, (16*2) + ldr_vo \a3, \addr, (16*3) +.endm + +.macro load_vectors_with_offset a0, a1, a2, a3, addr, offset + ldr_vo \a0, \addr, (16*0 + (\offset)) + ldr_vo \a1, \addr, (16*1 + (\offset)) + ldr_vo \a2, \addr, (16*2 + (\offset)) + ldr_vo \a3, \addr, (16*3 + (\offset)) +.endm + +.macro store_vectors_with_inc a0, a1, a2, a3, addr, inc + str_vi \a0, \addr, \inc + str_vo \a1, \addr, (-(\inc) + 16*1) + str_vo \a2, \addr, (-(\inc) + 16*2) + str_vo \a3, \addr, (-(\inc) + 16*3) +.endm + +.macro vec_to_scalar_matrix out, in + vext \out\()_00, \in\()0, 0 + vext \out\()_01, \in\()0, 1 + vext \out\()_10, \in\()1, 0 + vext \out\()_11, \in\()1, 1 + vext \out\()_20, \in\()2, 0 + vext \out\()_21, \in\()2, 1 + vext \out\()_30, \in\()3, 0 + vext \out\()_31, \in\()3, 1 +.endm + +.macro store_scalar_matrix_with_inc x, addr, inc + str \x\()t_00, [\addr], #( \inc) + str \x\()t_01, [\addr, #(-\inc + 8*1)] + str \x\()t_10, [\addr, #(-\inc + 8*2)] + str \x\()t_11, [\addr, #(-\inc + 8*3)] + str \x\()t_20, [\addr, #(-\inc + 8*4)] + str \x\()t_21, [\addr, #(-\inc + 8*5)] + str \x\()t_30, [\addr, #(-\inc + 8*6)] + str \x\()t_31, [\addr, #(-\inc + 8*7)] +.endm + +.macro vext gpr_out, vec_in, lane + umov \gpr_out\(), \vec_in\().d[\lane] +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_456 + ldr_vi root0, r_ptr0, 64 + ldr_vo root1, r_ptr0, (-64 + 16) + ldr_vo root2, r_ptr0, (-64 + 32) + ldr_vo root3, r_ptr0, (-64 + 48) +.endm + +.macro load_roots_78_part1 + ldr_vi root0, r_ptr1, (12*16) + ldr_vo root0_tw, r_ptr1, (-12*16 + 1*16) + ldr_vo root1, r_ptr1, (-12*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-12*16 + 3*16) + ldr_vo root2, r_ptr1, (-12*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-12*16 + 5*16) +.endm + +.macro load_roots_78_part2 + ldr_vo root0, r_ptr1, (-12*16 + 6*16) + ldr_vo 
root0_tw, r_ptr1, (-12*16 + 7*16)
+        ldr_vo root1, r_ptr1, (-12*16 + 8*16)
+        ldr_vo root1_tw, r_ptr1, (-12*16 + 9*16)
+        ldr_vo root2, r_ptr1, (-12*16 + 10*16)
+        ldr_vo root2_tw, r_ptr1, (-12*16 + 11*16)
+.endm
+// transpose4: in-place 4x4 transpose of the 32-bit elements held in data0..data3. Overwrites the file-level aliases t0..t3.
+.macro transpose4 data0, data1, data2, data3
+        trn1 t0.4s, \data0\().4s, \data1\().4s                  // stage 1: interleave 32-bit lanes of row pairs
+        trn2 t1.4s, \data0\().4s, \data1\().4s
+        trn1 t2.4s, \data2\().4s, \data3\().4s
+        trn2 t3.4s, \data2\().4s, \data3\().4s
+
+        trn2 \data2\().2d, t0.2d, t2.2d                         // stage 2: combine 64-bit halves into the final columns
+        trn2 \data3\().2d, t1.2d, t3.2d
+        trn1 \data0\().2d, t0.2d, t2.2d
+        trn1 \data1\().2d, t1.2d, t3.2d
+.endm
+// transpose_single: only the 32-bit interleave stage of transpose4, written to separate output registers.
+.macro transpose_single data_out0, data_out1, data_out2, data_out3, data_in0, data_in1, data_in2, data_in3
+        trn1 \data_out0\().4s, \data_in0\().4s, \data_in1\().4s
+        trn2 \data_out1\().4s, \data_in0\().4s, \data_in1\().4s
+        trn1 \data_out2\().4s, \data_in2\().4s, \data_in3\().4s
+        trn2 \data_out3\().4s, \data_in2\().4s, \data_in3\().4s
+.endm
+// save_gprs: reserve 96 bytes of stack and store x19..x30 pairwise (callee-saved GPRs under AAPCS64).
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0]                               // NOTE(review): repeats the previous store (same registers, same slot); redundant but harmless
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        stp x29, x30, [sp, #16*5]
+.endm
+// restore_gprs: reload x19..x30 from the slots written by save_gprs and release the 96 bytes.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldp x29, x30, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+// save_vregs: reserve 64 bytes of stack and store d8..d15, i.e. the low 64 bits of v8..v15.
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+// restore_vregs: reload d8..d15 from the slots written by save_vregs and release the 64 bytes.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+// Scratch area that push_stack reserves below the saved registers: STACK_SIZE bytes, with slot STACK0 at offset 0.
+#define STACK_SIZE 16
+#define STACK0 0
+// restore / save: load / store register a at [sp + loc]. Note the differing argument order: restore a, loc versus save loc, a.
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+.macro push_stack //
slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE                                 // reserve the scratch area addressed by the save/restore macros
+.endm
+// pop_stack: undo push_stack in reverse order (release scratch area, then restore vector registers, then GPRs).
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+// Twiddle-factor table; its contents and any inner labels come from the included file.
+.data
+.p2align 4
+roots:
+#include "intt_dilithium_123_456_78_twiddles.s"
+.text
+// Entry point is exported both with and without a leading underscore.
+        .global intt_dilithium_123_45678_opt_m1_icestorm
+        .global _intt_dilithium_123_45678_opt_m1_icestorm
+// Constants kept in .text. The word at const_addr is broadcast into consts by the ld1r in the prologue; 8380417 = 2^23 - 2^13 + 1 (presumably the Dilithium modulus; confirm). ninv_addr and ninv_tw_addr are not referenced in the visible part of the function.
+.p2align 4
+const_addr: .word 8380417
+        .word 0
+        .word 0
+        .word 0
+ninv_addr: .quad 16382
+ninv_tw_addr: .quad 4197891
+intt_dilithium_123_45678_opt_m1_icestorm:
+_intt_dilithium_123_45678_opt_m1_icestorm:
+        push_stack
+// General-purpose register aliases: data pointers, loop counter, twiddle pointers and a temporary.
+        in .req x0
+        inp .req x1
+        inpp .req x2
+        count .req x3
+        r_ptr0 .req x4
+        r_ptr1 .req x5
+        xtmp .req x6
+// Vector aliases for the eight data registers.
+        data0 .req v9
+        data1 .req v10
+        data2 .req v11
+        data3 .req v12
+        data4 .req v13
+        data5 .req v14
+        data6 .req v15
+        data7 .req v16
+// q-register views of the same registers; the ldr_/str_ macros reach them through the qform_ prefix.
+        qform_data0 .req q9
+        qform_data1 .req q10
+        qform_data2 .req q11
+        qform_data3 .req q12
+        qform_data4 .req q13
+        qform_data5 .req q14
+        qform_data6 .req q15
+        qform_data7 .req q16
+// qform_vN maps every raw vector register name vN to qN, so the load/store macros also accept plain vN arguments.
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+// Scalar matrix x_rc in x10..x17 (x_00 and x_01 share x10 and x11 with xtmp0 and xtmp1).
+        x_00 .req x10
+        x_01 .req x11
+        x_10 .req x12
+        x_11 .req x13
+        x_20 .req x14
+        x_21 .req x15
+        x_30 .req x16
+        x_31 .req x17
+// xt_rc: a permuted view of the x_rc registers, used by store_scalar_matrix_with_inc.
+        xt_00 .req x_00
+        xt_01 .req x_20
+        xt_10 .req x_10
+        xt_11 .req x_30
+        xt_20 .req x_01
+        xt_21 .req x_21
+        xt_30 .req x_11
+        xt_31 .req x_31
+// Twiddle-factor vector aliases.
+        root0 .req v0
+        root1 .req v1
+        root2
.req v2 + root3 .req v3 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root3 .req q3 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + tp0 .req v17 + tp1 .req v18 + tp2 .req v19 + tp3 .req v20 + + consts .req v8 + qform_consts .req q8 + + ASM_LOAD(r_ptr0, roots_l345) + ASM_LOAD(r_ptr1, roots_l67) + + ASM_LOAD(xtmp, const_addr) + ld1r {consts.4s}, [xtmp] + save STACK0, in + + restore inp, STACK0 + mov inp, in + add inpp, inp, #64 + mov count, #8 + + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + root3_tw .req v7 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + qform_root3_tw .req q7 + + .p2align 2 + // gap // ........................................................................................................................... + ldr q10, [x5, #144] // ..........*................................................................................................................ + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // ........*.................................................................................................................. + ld4 {v23.4S, v24.4S, v25.4S, v26.4S}, [x2] // .................*......................................................................................................... + ldr q9, [x5, #160] // ...........*............................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q21, [x5, #80] // *.......................................................................................................................... + ldr q1, [x5, #176] // .*......................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q6, [x5, #128] // ...*....................................................................................................................... + ldr q30, [x5, #64] // ....*...................................................................................................................... + ldr q27, [x5, #32] // ......*.................................................................................................................... + // gap // ........................................................................................................................... + ldr q28, [x5, #48] // .....*..................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sub v20.4S, v16.4S, v17.4S // ..............*............................................................................................................ + sub v31.4S, v14.4S, v15.4S // .............*............................................................................................................. + // gap // ........................................................................................................................... + sub v13.4S, v23.4S, v24.4S // .............................*............................................................................................. + sub v7.4S, v25.4S, v26.4S // ............................*.............................................................................................. + // gap // ........................................................................................................................... + ldr q5, [x5, #112] // .......*................................................................................................................... + mul v2.4S, v31.4S, v27.4S // ..................*........................................................................................................ + ldr q27, [x5, #16] // .........*................................................................................................................. + sqrdmulh v4.4S, v31.4S, v28.4S // ...................*....................................................................................................... 
+ // gap // ........................................................................................................................... + sqrdmulh v31.4S, v20.4S, v21.4S // ....................*...................................................................................................... + mul v12.4S, v7.4S, v9.4S // .....................................*..................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q21, [x5], #(12*16) // ..*........................................................................................................................ + sqrdmulh v22.4S, v13.4S, v10.4S // .......................................*................................................................................... + ldr q11, [x5, #-96] // ......................*.................................................................................................... + mul v10.4S, v20.4S, v30.4S // .....................*..................................................................................................... + mls v2.4S, v4.4S, v8.S[0] // .........................*................................................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sqrdmulh v20.4S, v7.4S, v1.4S // .................................*......................................................................................... 
+ add v28.4S, v25.4S, v26.4S // ...............................*........................................................................................... + add v26.4S, v14.4S, v15.4S // ................*.......................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + add v18.4S, v16.4S, v17.4S // ...............*........................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v10.4S, v31.4S, v8.S[0] // ..........................*................................................................................................ + mul v9.4S, v13.4S, v6.4S // ...................................*....................................................................................... + // gap // ........................................................................................................................... + mls v12.4S, v20.4S, v8.S[0] // .............................................*............................................................................. + // gap // ........................................................................................................................... + sub v1.4S, v26.4S, v18.4S // ........................*.................................................................................................. 
+ // gap // ........................................................................................................................... + add v23.4S, v23.4S, v24.4S // ................................*.......................................................................................... + // gap // ........................................................................................................................... + add v20.4S, v2.4S, v10.4S // ...............................................*........................................................................... + sub v13.4S, v2.4S, v10.4S // ....................................*...................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sqrdmulh v31.4S, v1.4S, v27.4S // .........................................*................................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v9.4S, v22.4S, v8.S[0] // ..............................................*............................................................................ + mul v22.4S, v13.4S, v21.4S // ...........................................*............................................................................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + sub v4.4S, v23.4S, v28.4S // ......................................*.................................................................................... + mul v24.4S, v1.4S, v21.4S // ..................................*........................................................................................ + sqrdmulh v1.4S, v13.4S, v27.4S // ........................................*.................................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + add v13.4S, v9.4S, v12.4S // ......................................................*.................................................................... + sub v12.4S, v9.4S, v12.4S // ...................................................*....................................................................... + ldr q30, [x4, #48] // ...........................................................................*............................................... + sqrdmulh v9.4S, v4.4S, v5.4S // ..........................................*................................................................................ + add v26.4S, v26.4S, v18.4S // .......................*................................................................................................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v24.4S, v31.4S, v8.S[0] // ................................................*.......................................................................... + mls v22.4S, v1.4S, v8.S[0] // .................................................*......................................................................... + mul v10.4S, v4.4S, v11.4S // ............................................*.............................................................................. + add v1.4S, v23.4S, v28.4S // ....................................................*...................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + ldr q29, [x4, #16] // ............*.............................................................................................................. + ldr q19, [x4], #64 // ...........................*............................................................................................... + trn2 v16.4S, v26.4S, v20.4S // .....................................................*..................................................................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn1 v20.4S, v26.4S, v20.4S // ...........................................................*............................................................... + trn1 v26.4S, v24.4S, v22.4S // ............................................................*.............................................................. + trn2 v4.4S, v24.4S, v22.4S // .........................................................*................................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v10.4S, v9.4S, v8.S[0] // ..................................................*........................................................................ + // gap // ........................................................................................................................... + ldr q14, [x4, #-32] // ..............................*............................................................................................ + trn2 v23.4S, v1.4S, v13.4S // ..........................................................*................................................................ + sqrdmulh v5.4S, v12.4S, v5.4S // ........................................................*.................................................................. + trn1 v17.2D, v20.2D, v26.2D // ...............................................................*........................................................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v25.4S, v12.4S, v11.4S // .......................................................*................................................................... + // gap // ........................................................................................................................... + trn2 v9.2D, v20.2D, v26.2D // .................................................................*......................................................... + trn2 v24.2D, v16.2D, v4.2D // ..............................................................*............................................................ + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn1 v1.4S, v1.4S, v13.4S // ..................................................................*........................................................ + trn1 v7.2D, v16.2D, v4.2D // ................................................................*.......................................................... + sub v12.4S, v9.4S, v24.4S // .................................................................................*......................................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + mls v25.4S, v5.4S, v8.S[0] // .............................................................*............................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + add v20.4S, v9.4S, v24.4S // ......................................................................*.................................................... + add v13.4S, v17.4S, v7.4S // .....................................................................*..................................................... + mul v31.4S, v12.4S, v14.S[0] // .......................................................................................*................................... + sqrdmulh v22.4S, v12.4S, v14.S[1] // ......................................................................................*.................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn2 v26.4S, v10.4S, v25.4S // ....................................................................*...................................................... 
+ trn1 v24.4S, v10.4S, v25.4S // ...................................................................*....................................................... + sub v12.4S, v13.4S, v20.4S // ...................................................................................................*....................... + add v10.4S, v13.4S, v20.4S // ...............................................................................*........................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn1 v20.2D, v23.2D, v26.2D // ..........................................................................*................................................ + trn1 v13.2D, v1.2D, v24.2D // .........................................................................*................................................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + trn2 v0.2D, v23.2D, v26.2D // ........................................................................*.................................................. + trn2 v16.2D, v1.2D, v24.2D // .......................................................................*................................................... 
+ sub v9.4S, v13.4S, v20.4S // .............................................................................................*............................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v4.4S, v12.4S, v19.S[2] // ..........................................................................................................................* + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + add v26.4S, v16.4S, v0.4S // .............................................................................*............................................. + add v24.4S, v13.4S, v20.4S // ..............................................................................*............................................ + sqrdmulh v20.4S, v9.4S, v14.S[3] // .................................................................................................*......................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v11.4S, v9.4S, v14.S[2] // ................................................................................................*.......................... + // gap // ........................................................................................................................... 
+ add v27.4S, v24.4S, v26.4S // ........................................................................................*.................................. + // gap // ........................................................................................................................... + sub v1.4S, v17.4S, v7.4S // ............................................................................*.............................................. + // gap // ........................................................................................................................... + srshr v9.4S, v10.4S, #23 // .....................................................................................*..................................... + // gap // ........................................................................................................................... + sub v17.4S, v16.4S, v0.4S // ................................................................................*.......................................... + mul v16.4S, v1.4S, v29.S[2] // ...................................................................................*....................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + srshr v13.4S, v27.4S, #23 // ............................................................................................*.............................. + mls v31.4S, v22.4S, v8.S[0] // ...............................................................................................*........................... + // gap // ........................................................................................................................... 
+ // gap // ........................................................................................................................... + sqrdmulh v1.4S, v1.4S, v29.S[3] // ....................................................................................*...................................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v10.4S, v9.4S, v8.4S // ...........................................................................................*............................... + mls v11.4S, v20.4S, v8.S[0] // .......................................................................................................*................... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v27.4S, v13.4S, v8.4S // ..................................................................................................*........................ + mul v15.4S, v17.4S, v30.S[0] // .........................................................................................*................................. + // gap // ........................................................................................................................... + mls v16.4S, v1.4S, v8.S[0] // ..........................................................................................*................................ + // gap // ........................................................................................................................... 
+ sqrdmulh v7.4S, v17.4S, v30.S[1] // ..............................................................................................*............................ + sqrdmulh v30.4S, v12.4S, v19.S[3] // ......................................................................................................................*.... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sub v9.4S, v24.4S, v26.4S // ..................................................................................*........................................ + // gap // ........................................................................................................................... + add v24.4S, v10.4S, v27.4S // ..............................................................................................................*............ + sub v22.4S, v10.4S, v27.4S // ........................................................................................................*.................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sub v10.4S, v16.4S, v31.4S // ......................................................................................................*.................... + // gap // ........................................................................................................................... + mls v15.4S, v7.4S, v8.S[0] // .....................................................................................................*..................... 
+ // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v17.4S, v22.4S, v19.S[0] // .............................................................................................................*............. + sqrdmulh v12.4S, v22.4S, v19.S[1] // ............................................................................................................*.............. + str q24, [x1], #(16*4) // .....................................................................................................................*..... + sqrdmulh v22.4S, v10.4S, v19.S[3] // ...........................................................................................................*............... + // gap // ........................................................................................................................... + mul v10.4S, v10.4S, v19.S[2] // ..........................................................................................................*................ + // gap // ........................................................................................................................... + add v26.4S, v16.4S, v31.4S // ....................................................................................................*...................... + sub v13.4S, v11.4S, v15.4S // ...............................................................................................................*........... + // gap // ........................................................................................................................... + add v31.4S, v11.4S, v15.4S // .................................................................................................................*......... 
+ mls v17.4S, v12.4S, v8.S[0] // ..................................................................................................................*........ + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mls v10.4S, v22.4S, v8.S[0] // ................................................................................................................*.......... + srshr v22.4S, v26.4S, #23 // .........................................................................................................*................. + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + sqrdmulh v20.4S, v13.4S, v29.S[1] // .......................................................................................................................*... + // gap // ........................................................................................................................... + // gap // ........................................................................................................................... + mul v1.4S, v13.4S, v29.S[0] // ...................................................................................................................*....... + str q17, [x2], #(16*4) // ........................................................................................................................*.. + // gap // ........................................................................................................................... 
+ mul v7.4S, v9.4S, v29.S[0] // .........................................................................................................................*. + srshr v12.4S, v31.4S, #23 // ....................................................................................................................*...... + + // original source code + // ldr q17, [x5, #80] // ....*...................................................................................................................... + // ldr q16, [x5, #176] // .....*..................................................................................................................... + // ldr q15, [x5], #(12*16) // ....................*...................................................................................................... + // ldr q24, [x5, #-64] // ......*.................................................................................................................... + // ldr q28, [x5, #-128] // .......*................................................................................................................... + // ldr q18, [x5, #-144] // .........*................................................................................................................. + // ldr q25, [x5, #-160] // ........*.................................................................................................................. + // ldr q7, [x5, #-80] // ..............*............................................................................................................ + // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*......................................................................................................................... + // ldr q0, [x5, #-176] // ................*.......................................................................................................... 
+ // ldr q30, [x5, #-48] // *.......................................................................................................................... + // ldr q27, [x5, #-32] // ...*....................................................................................................................... + // ldr q29, [x4, #16] // ...................................................*....................................................................... + // sub v22.4S, v3.4S, v4.4S // ...........*............................................................................................................... + // sub v12.4S, v5.4S, v6.4S // ..........*................................................................................................................ + // add v11.4S, v5.4S, v6.4S // ............................*.............................................................................................. + // add v20.4S, v3.4S, v4.4S // ...........................*............................................................................................... + // ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // ..*........................................................................................................................ + // mul v10.4S, v22.4S, v25.4S // ...............*........................................................................................................... + // sqrdmulh v18.4S, v22.4S, v18.4S // .................*......................................................................................................... + // sqrdmulh v26.4S, v12.4S, v17.4S // ..................*........................................................................................................ + // mul v28.4S, v12.4S, v28.4S // .......................*................................................................................................... 
+ // ldr q31, [x5, #-96] // ......................*.................................................................................................... + // add v5.4S, v20.4S, v11.4S // ..............................................*............................................................................ + // sub v12.4S, v20.4S, v11.4S // ................................*.......................................................................................... + // mls v10.4S, v18.4S, v8.S[0] // ........................*.................................................................................................. + // mls v28.4S, v26.4S, v8.S[0] // .............................*............................................................................................. + // ldr q19, [x4], #64 // ....................................................*...................................................................... + // sub v9.4S, v3.4S, v4.4S // .............*............................................................................................................. + // sub v17.4S, v1.4S, v2.4S // ............*.............................................................................................................. + // ldr q26, [x4, #-32] // ..........................................................*................................................................ + // add v11.4S, v3.4S, v4.4S // ..........................*................................................................................................ + // add v22.4S, v1.4S, v2.4S // .................................*......................................................................................... + // sqrdmulh v4.4S, v9.4S, v16.4S // .........................*................................................................................................. 
+ // mul v6.4S, v12.4S, v15.4S // ........................................*.................................................................................. + // mul v13.4S, v17.4S, v24.4S // ..............................*............................................................................................ + // sub v18.4S, v10.4S, v28.4S // ...................................*....................................................................................... + // mul v23.4S, v9.4S, v27.4S // ...................*....................................................................................................... + // sub v20.4S, v22.4S, v11.4S // .......................................*................................................................................... + // sqrdmulh v3.4S, v17.4S, v30.4S // .....................*..................................................................................................... + // sqrdmulh v1.4S, v18.4S, v0.4S // .........................................*................................................................................. + // sqrdmulh v17.4S, v12.4S, v0.4S // ....................................*...................................................................................... + // sqrdmulh v12.4S, v20.4S, v7.4S // .............................................*............................................................................. + // mul v24.4S, v18.4S, v15.4S // ......................................*.................................................................................... + // mul v25.4S, v20.4S, v31.4S // .................................................*......................................................................... + // mls v23.4S, v4.4S, v8.S[0] // ...............................*........................................................................................... 
+ // mls v13.4S, v3.4S, v8.S[0] // .....................................*..................................................................................... + // add v14.4S, v10.4S, v28.4S // ..................................*........................................................................................ + // mls v6.4S, v17.4S, v8.S[0] // ...............................................*........................................................................... + // mls v24.4S, v1.4S, v8.S[0] // ................................................*.......................................................................... + // mls v25.4S, v12.4S, v8.S[0] // .........................................................*................................................................. + // sub v20.4S, v13.4S, v23.4S // ...........................................*............................................................................... + // add v27.4S, v22.4S, v11.4S // ..................................................*........................................................................ + // trn2 v9.4S, v5.4S, v14.4S // .....................................................*..................................................................... + // add v16.4S, v13.4S, v23.4S // ..........................................*................................................................................ + // mul v31.4S, v20.4S, v31.4S // ..............................................................*............................................................ + // sqrdmulh v30.4S, v20.4S, v7.4S // ............................................................*.............................................................. + // trn2 v12.4S, v6.4S, v24.4S // ........................................................*.................................................................. 
+ // trn2 v13.4S, v27.4S, v16.4S // ...........................................................*............................................................... + // trn1 v4.4S, v5.4S, v14.4S // ......................................................*.................................................................... + // trn1 v20.4S, v6.4S, v24.4S // .......................................................*................................................................... + // mls v31.4S, v30.4S, v8.S[0] // ....................................................................*...................................................... + // trn2 v1.2D, v9.2D, v12.2D // ................................................................*.......................................................... + // trn1 v22.2D, v4.2D, v20.2D // .............................................................*............................................................. + // trn1 v17.2D, v9.2D, v12.2D // ..................................................................*........................................................ + // trn2 v4.2D, v4.2D, v20.2D // ...............................................................*........................................................... + // trn1 v9.4S, v27.4S, v16.4S // .................................................................*......................................................... + // trn1 v20.4S, v25.4S, v31.4S // ..........................................................................*................................................ + // trn2 v16.4S, v25.4S, v31.4S // .........................................................................*................................................. + // add v10.4S, v22.4S, v17.4S // ......................................................................*.................................................... 
+ // add v31.4S, v4.4S, v1.4S // .....................................................................*..................................................... + // trn2 v12.2D, v9.2D, v20.2D // ................................................................................*.......................................... + // trn2 v30.2D, v13.2D, v16.2D // ...............................................................................*........................................... + // trn1 v23.2D, v9.2D, v20.2D // ..............................................................................*............................................ + // trn1 v28.2D, v13.2D, v16.2D // .............................................................................*............................................. + // ldr q24, [x4, #-16] // ............................................*.............................................................................. + // sub v22.4S, v22.4S, v17.4S // ........................................................................................*.................................. + // add v13.4S, v12.4S, v30.4S // ...................................................................................*....................................... + // add v20.4S, v23.4S, v28.4S // ....................................................................................*...................................... + // add v17.4S, v10.4S, v31.4S // ............................................................................*.............................................. + // sub v12.4S, v12.4S, v30.4S // ..........................................................................................*................................ + // sub v4.4S, v4.4S, v1.4S // ...................................................................*....................................................... 
+ // sub v9.4S, v20.4S, v13.4S // ......................................................................................................*.................... + // mul v16.4S, v22.4S, v29.S[2] // ...........................................................................................*............................... + // sqrdmulh v30.4S, v22.4S, v29.S[3] // ..............................................................................................*............................ + // srshr v22.4S, v17.4S, #23 // .........................................................................................*................................. + // sqrdmulh v1.4S, v4.4S, v26.S[1] // ........................................................................*.................................................. + // mul v6.4S, v4.4S, v26.S[0] // .......................................................................*................................................... + // add v4.4S, v20.4S, v13.4S // .......................................................................................*................................... + // mul v25.4S, v12.4S, v24.S[0] // ..................................................................................................*........................ + // mls v16.4S, v30.4S, v8.S[0] // ...................................................................................................*....................... + // mls v17.4S, v22.4S, v8.4S // ...............................................................................................*........................... + // srshr v20.4S, v4.4S, #23 // ............................................................................................*.............................. + // sub v22.4S, v23.4S, v28.4S // .................................................................................*......................................... 
+ // sqrdmulh v12.4S, v12.4S, v24.S[1] // ....................................................................................................*...................... + // mls v6.4S, v1.4S, v8.S[0] // .............................................................................................*............................. + // mul v1.4S, v22.4S, v26.S[2] // ......................................................................................*.................................... + // sqrdmulh v22.4S, v22.4S, v26.S[3] // .....................................................................................*..................................... + // mls v4.4S, v20.4S, v8.4S // .................................................................................................*......................... + // sub v24.4S, v10.4S, v31.4S // ...........................................................................*............................................... + // add v26.4S, v16.4S, v6.4S // ................................................................................................................*.......... + // mls v25.4S, v12.4S, v8.S[0] // ..........................................................................................................*................ + // sub v30.4S, v16.4S, v6.4S // .........................................................................................................*................. + // mls v1.4S, v22.4S, v8.S[0] // ................................................................................................*.......................... + // sub v20.4S, v17.4S, v4.4S // ........................................................................................................*.................. + // srshr v22.4S, v26.4S, #23 // .....................................................................................................................*..... 
+ // mul v10.4S, v30.4S, v19.S[2] // ...............................................................................................................*........... + // sqrdmulh v16.4S, v30.4S, v19.S[3] // ..............................................................................................................*............ + // sqrdmulh v12.4S, v20.4S, v19.S[1] // ............................................................................................................*.............. + // mul v13.4S, v20.4S, v19.S[0] // ...........................................................................................................*............... + // add v20.4S, v17.4S, v4.4S // .......................................................................................................*................... + // sub v4.4S, v1.4S, v25.4S // .................................................................................................................*......... + // mls v10.4S, v16.4S, v8.S[0] // ....................................................................................................................*...... + // add v31.4S, v1.4S, v25.4S // ..................................................................................................................*........ + // mls v13.4S, v12.4S, v8.S[0] // ...................................................................................................................*....... + // mul v1.4S, v4.4S, v29.S[0] // .......................................................................................................................*... + // srshr v12.4S, v31.4S, #23 // ..........................................................................................................................* + // str q20, [x1], #(16*4) // .............................................................................................................*............. 
+ // sqrdmulh v30.4S, v24.4S, v19.S[3] // .....................................................................................................*..................... + // sqrdmulh v20.4S, v4.4S, v29.S[1] // ......................................................................................................................*.... + // str q13, [x2], #(16*4) // ........................................................................................................................*.. + // mul v7.4S, v9.4S, v29.S[0] // .........................................................................................................................*. + // mul v4.4S, v24.4S, v19.S[2] // ..................................................................................*........................................ + + sub count, count, #1 +layer45678_start: + sqrdmulh v13.4S, v9.4S, v29.S[1] // ...........................................................................................................*............................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v26.4S, v22.4S, v8.4S // .....................................................................................................................*.................................. + mls v31.4S, v12.4S, v8.4S // .........................................................................................................................*.............................. + ldr q17, [x5, #80] // .......e................................................................................................................................................ 
+ ldr q16, [x5, #176] // .................................e...................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v1.4S, v20.4S, v8.S[0] // .................................................................................................................*...................................... + // gap // ........................................................................................................................................................ + ldr q15, [x5], #(12*16) // ..e..................................................................................................................................................... + mls v7.4S, v13.4S, v8.S[0] // ............................................................................................................*........................................... + mls v4.4S, v30.4S, v8.S[0] // ..................................................................................................*..................................................... + sub v30.4S, v26.4S, v31.4S // ...............................................................................................................................*........................ 
+ add v22.4S, v26.4S, v31.4S // ................................................................................................................................*....................... + ldr q24, [x5, #-64] // ..............................e......................................................................................................................... + ldr q28, [x5, #-128] // ......e................................................................................................................................................. + add v12.4S, v10.4S, v1.4S // ..........................................................................................................................................*............. + // gap // ........................................................................................................................................................ + ldr q18, [x5, #-144] // .....e.................................................................................................................................................. + sub v1.4S, v10.4S, v1.4S // .........................................................................................................................................*.............. + ldr q25, [x5, #-160] // ....e................................................................................................................................................... + add v9.4S, v4.4S, v7.4S // .....................................................................................................................................*.................. + sub v4.4S, v4.4S, v7.4S // ....................................................................................................................................*................... + str q22, [x1, #-48] // ...............................................................................................................................................*........ 
+ str q12, [x1, #-16] // .................................................................................................................................................*...... + ldr q7, [x5, #-80] // .............................e.......................................................................................................................... + mul v20.4S, v1.4S, v19.S[0] // ...........................................................................................................................................*............ + sqrdmulh v12.4S, v1.4S, v19.S[1] // ............................................................................................................................................*........... + sqrdmulh v31.4S, v4.4S, v19.S[1] // .......................................................................................................................................*................ + mul v13.4S, v4.4S, v19.S[0] // ......................................................................................................................................*................. + str q9, [x1, #-32] // ................................................................................................................................................*....... + add x1, x1, #64 // ......................................................................................................................................................*. + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e....................................................................................................................................................... + sqrdmulh v22.4S, v30.4S, v19.S[1] // ..................................................................................................................................*..................... 
+ mul v1.4S, v30.4S, v19.S[0] // .................................................................................................................................*...................... + ldr q0, [x5, #-176] // ...e.................................................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q30, [x5, #-48] // ...............................e........................................................................................................................ + mls v20.4S, v12.4S, v8.S[0] // .............................................................................................................................................*.......... + // gap // ........................................................................................................................................................ + ldr q27, [x5, #-32] // ................................e....................................................................................................................... + // gap // ........................................................................................................................................................ + mls v13.4S, v31.4S, v8.S[0] // ........................................................................................................................................*............... + mls v1.4S, v22.4S, v8.S[0] // ...................................................................................................................................*.................... 
+ // gap // ........................................................................................................................................................ + ldr q29, [x4, #16] // .......................................................................e................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q20, [x2, #-16] // .....................................................................................................................................................*.. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + str q13, [x2, #-32] // ....................................................................................................................................................*... 
+ sub v22.4S, v3.4S, v4.4S // ........e............................................................................................................................................... + str q1, [x2, #-48] // ...................................................................................................................................................*.... + add x2, x2, #64 // .......................................................................................................................................................* + sub v12.4S, v5.4S, v6.4S // .............e.......................................................................................................................................... + add v11.4S, v5.4S, v6.4S // ..............e......................................................................................................................................... + add v20.4S, v3.4S, v4.4S // .........e.............................................................................................................................................. + // gap // ........................................................................................................................................................ + ld4 {v1.4S, v2.4S, v3.4S, v4.4S}, [x2] // .e...................................................................................................................................................... + mul v10.4S, v22.4S, v25.4S // ..........e............................................................................................................................................. + sqrdmulh v18.4S, v22.4S, v18.4S // ...........e............................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sqrdmulh v26.4S, v12.4S, v17.4S // ................e....................................................................................................................................... + mul v28.4S, v12.4S, v28.4S // ...............e........................................................................................................................................ + // gap // ........................................................................................................................................................ + ldr q31, [x5, #-96] // ............................e........................................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v5.4S, v20.4S, v11.4S // ...................e.................................................................................................................................... 
+ sub v12.4S, v20.4S, v11.4S // ..................e..................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v10.4S, v18.4S, v8.S[0] // ............e........................................................................................................................................... + // gap // ........................................................................................................................................................ + mls v28.4S, v26.4S, v8.S[0] // .................e...................................................................................................................................... + ldr q19, [x4], #64 // ......................................................................e................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v9.4S, v3.4S, v4.4S // .......................................e................................................................................................................ + sub v17.4S, v1.4S, v2.4S // ..................................e..................................................................................................................... + // gap // ........................................................................................................................................................ 
+ ldr q26, [x4, #-32] // ........................................................................e............................................................................... + add v11.4S, v3.4S, v4.4S // ........................................e............................................................................................................... + add v22.4S, v1.4S, v2.4S // ...................................e.................................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v4.4S, v9.4S, v16.4S // ..........................................e............................................................................................................. + mul v6.4S, v12.4S, v15.4S // ....................e................................................................................................................................... + // gap // ........................................................................................................................................................ + mul v13.4S, v17.4S, v24.4S // ....................................e................................................................................................................... + sub v18.4S, v10.4S, v28.4S // .......................e................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v23.4S, v9.4S, v27.4S // .........................................e.............................................................................................................. + // gap // ........................................................................................................................................................ + sub v20.4S, v22.4S, v11.4S // ............................................e........................................................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v3.4S, v17.4S, v30.4S // .....................................e.................................................................................................................. + sqrdmulh v1.4S, v18.4S, v0.4S // ..........................e............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v17.4S, v12.4S, v0.4S // .....................e.................................................................................................................................. + sqrdmulh v12.4S, v20.4S, v7.4S // ...............................................e........................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v24.4S, v18.4S, v15.4S // .........................e.............................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v25.4S, v20.4S, v31.4S // ..............................................e......................................................................................................... + mls v23.4S, v4.4S, v8.S[0] // ...........................................e............................................................................................................ + mls v13.4S, v3.4S, v8.S[0] // ......................................e................................................................................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v14.4S, v10.4S, v28.4S // ........................e............................................................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mls v6.4S, v17.4S, v8.S[0] // ......................e................................................................................................................................. + mls v24.4S, v1.4S, v8.S[0] // ...........................e............................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v25.4S, v12.4S, v8.S[0] // ................................................e....................................................................................................... + sub v20.4S, v13.4S, v23.4S // .................................................e...................................................................................................... + add v27.4S, v22.4S, v11.4S // .............................................e.......................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v9.4S, v5.4S, v14.4S // .......................................................e................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v16.4S, v13.4S, v23.4S // ..................................................e..................................................................................................... + mul v31.4S, v20.4S, v31.4S // ...................................................e.................................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v20.4S, v7.4S // ....................................................e................................................................................................... + // gap // ........................................................................................................................................................ + trn2 v12.4S, v6.4S, v24.4S // .........................................................e.............................................................................................. + // gap // ........................................................................................................................................................ + trn2 v13.4S, v27.4S, v16.4S // ...............................................................e........................................................................................ 
+ trn1 v4.4S, v5.4S, v14.4S // ......................................................e................................................................................................. + trn1 v20.4S, v6.4S, v24.4S // ........................................................e............................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v31.4S, v30.4S, v8.S[0] // .....................................................e.................................................................................................. + trn2 v1.2D, v9.2D, v12.2D // ...........................................................e............................................................................................ + trn1 v22.2D, v4.2D, v20.2D // ............................................................e........................................................................................... + trn1 v17.2D, v9.2D, v12.2D // .............................................................e.......................................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + trn2 v4.2D, v4.2D, v20.2D // ..........................................................e............................................................................................. + trn1 v9.4S, v27.4S, v16.4S // ..............................................................e......................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn1 v20.4S, v25.4S, v31.4S // ................................................................e....................................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v16.4S, v25.4S, v31.4S // .................................................................e...................................................................................... + add v10.4S, v22.4S, v17.4S // ...........................................................................e............................................................................ + add v31.4S, v4.4S, v1.4S // ................................................................................e....................................................................... 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + trn2 v12.2D, v9.2D, v20.2D // ..................................................................e..................................................................................... + trn2 v30.2D, v13.2D, v16.2D // ...................................................................e.................................................................................... + // gap // ........................................................................................................................................................ + trn1 v23.2D, v9.2D, v20.2D // ....................................................................e................................................................................... + trn1 v28.2D, v13.2D, v16.2D // .....................................................................e.................................................................................. + // gap // ........................................................................................................................................................ + ldr q24, [x4, #-16] // .........................................................................e.............................................................................. 
+ // gap // ........................................................................................................................................................ + sub v22.4S, v22.4S, v17.4S // ..........................................................................e............................................................................. + add v13.4S, v12.4S, v30.4S // ..........................................................................................e............................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v20.4S, v23.4S, v28.4S // .....................................................................................e.................................................................. + add v17.4S, v10.4S, v31.4S // ...............................................................................................e........................................................ + sub v12.4S, v12.4S, v30.4S // .........................................................................................e.............................................................. + sub v4.4S, v4.4S, v1.4S // ...............................................................................e........................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v9.4S, v20.4S, v13.4S // ........................................................................................................e............................................... + mul v16.4S, v22.4S, v29.S[2] // ............................................................................e........................................................................... + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v22.4S, v29.S[3] // .............................................................................e.......................................................................... + srshr v22.4S, v17.4S, #23 // ..................................................................................................................e..................................... + // gap // ........................................................................................................................................................ + sqrdmulh v1.4S, v4.4S, v26.S[1] // ..................................................................................e..................................................................... + mul v6.4S, v4.4S, v26.S[0] // .................................................................................e...................................................................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + add v4.4S, v20.4S, v13.4S // .........................................................................................................e.............................................. + mul v25.4S, v12.4S, v24.S[0] // ...........................................................................................e............................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v16.4S, v30.4S, v8.S[0] // ..............................................................................e......................................................................... + // gap // ........................................................................................................................................................ + mls v17.4S, v22.4S, v8.4S // ...................................................................................................................e.................................... + // gap // ........................................................................................................................................................ + srshr v20.4S, v4.4S, #23 // ......................................................................................................................e................................. + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + sub v22.4S, v23.4S, v28.4S // ....................................................................................e................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v12.4S, v12.4S, v24.S[1] // ............................................................................................e........................................................... + mls v6.4S, v1.4S, v8.S[0] // ...................................................................................e.................................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mul v1.4S, v22.4S, v26.S[2] // ......................................................................................e................................................................. + sqrdmulh v22.4S, v22.4S, v26.S[3] // .......................................................................................e................................................................ + mls v4.4S, v20.4S, v8.4S // .......................................................................................................................e................................ 
+ // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v24.4S, v10.4S, v31.4S // ..............................................................................................e......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v26.4S, v16.4S, v6.4S // ....................................................................................................e................................................... + mls v25.4S, v12.4S, v8.S[0] // .............................................................................................e.......................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sub v30.4S, v16.4S, v6.4S // ...................................................................................................e.................................................... + mls v1.4S, v22.4S, v8.S[0] // ........................................................................................e............................................................... 
+ sub v20.4S, v17.4S, v4.4S // ..........................................................................................................................e............................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + srshr v22.4S, v26.4S, #23 // ....................................................................................................................e................................... + mul v10.4S, v30.4S, v19.S[2] // .....................................................................................................e.................................................. + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v16.4S, v30.4S, v19.S[3] // ......................................................................................................e................................................. + sqrdmulh v12.4S, v20.4S, v19.S[1] // .............................................................................................................................e.......................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ 
+ mul v13.4S, v20.4S, v19.S[0] // ............................................................................................................................e........................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + add v20.4S, v17.4S, v4.4S // ...........................................................................................................................e............................ + sub v4.4S, v1.4S, v25.4S // .............................................................................................................e.......................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + mls v10.4S, v16.4S, v8.S[0] // .......................................................................................................e................................................ + add v31.4S, v1.4S, v25.4S // ..............................................................................................................e......................................... + mls v13.4S, v12.4S, v8.S[0] // ..............................................................................................................................e......................... + // gap // ........................................................................................................................................................ 
+ // gap // ........................................................................................................................................................ + mul v1.4S, v4.4S, v29.S[0] // ...............................................................................................................e........................................ + srshr v12.4S, v31.4S, #23 // ........................................................................................................................e............................... + str q20, [x1], #(16*4) // ..............................................................................................................................................e......... + // gap // ........................................................................................................................................................ + sqrdmulh v30.4S, v24.4S, v19.S[3] // .................................................................................................e...................................................... + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + // gap // ........................................................................................................................................................ + sqrdmulh v20.4S, v4.4S, v29.S[1] // ................................................................................................................e....................................... + // gap // ........................................................................................................................................................ 
+ str q13, [x2], #(16*4) // ..................................................................................................................................................e..... + mul v7.4S, v9.4S, v29.S[0] // ..........................................................................................................e............................................. + mul v4.4S, v24.4S, v19.S[2] // ................................................................................................e....................................................... + + // original source code + // ld4 {v9.4S, v10.4S, v11.4S, v12.4S}, [x1] // .........................e...........................................................................................................................|...........................e.............. + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x2] // ...........................................e.........................................................................................................|.......................................... + // ldr q0, [x5], #(12*16) // ...e.................................................................................................................................................|.....e.................................... + // ldr q4, [x5, #(-12*16 + 1*16)] // ............................e........................................................................................................................|..............................e........... + // ldr q1, [x5, #(-12*16 + 2*16)] // .............e.......................................................................................................................................|...............e.......................... + // ldr q5, [x5, #(-12*16 + 3*16)] // ...........e.........................................................................................................................................|.............e............................ 
+ // ldr q2, [x5, #(-12*16 + 4*16)] // .........e...........................................................................................................................................|...........e.............................. + // ldr q6, [x5, #(-12*16 + 5*16)] // e....................................................................................................................................................|..e....................................... + // sub v24.4s, v9.4s, v10.4s // .....................................e...............................................................................................................|.......................................e.. + // add v9.4s, v9.4s, v10.4s // ..........................................e..........................................................................................................|.......................................... + // mul v10.4s, v24.4s, v1.4s // ............................................e........................................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .............................................e.......................................................................................................|.......................................... + // mls v10.4s, v24.4s, v8.s[0] // ...................................................e.................................................................................................|.......................................... + // sub v24.4s, v11.4s, v12.4s // ........................................e............................................................................................................|.......................................... 
+ // add v11.4s, v11.4s, v12.4s // .........................................e...........................................................................................................|.......................................... + // mul v12.4s, v24.4s, v2.4s // ...............................................e.....................................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ..............................................e......................................................................................................|.......................................... + // mls v12.4s, v24.4s, v8.s[0] // ....................................................e................................................................................................|.......................................... + // sub v24.4s, v9.4s, v11.4s // ..................................................e..................................................................................................|.......................................... + // add v9.4s, v9.4s, v11.4s // .................................................e...................................................................................................|.......................................... + // mul v11.4s, v24.4s, v0.4s // ............................................................e........................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ...................................................................e.................................................................................|.......................................... 
+ // mls v11.4s, v24.4s, v8.s[0] // ..........................................................................e..........................................................................|.......................................... + // sub v24.4s, v10.4s, v12.4s // ..............................................................e......................................................................................|.......................................... + // add v10.4s, v10.4s, v12.4s // .........................................................................e...........................................................................|.......................................... + // mul v12.4s, v24.4s, v0.4s // .....................................................................e...............................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................e..................................................................................|.......................................... + // mls v12.4s, v24.4s, v8.s[0] // ...........................................................................e.........................................................................|.......................................... + // ldr q0, [x5, #(-12*16 + 6*16)] // ................................................e....................................................................................................|.......................................... + // ldr q4, [x5, #(-12*16 + 7*16)] // ..................e..................................................................................................................................|....................e..................... 
+ // ldr q1, [x5, #(-12*16 + 8*16)] // ........e............................................................................................................................................|..........e............................... + // ldr q5, [x5, #(-12*16 + 9*16)] // .............................e.......................................................................................................................|...............................e.......... + // ldr q2, [x5, #(-12*16 + 10*16)] // ...............................e.....................................................................................................................|.................................e........ + // ldr q6, [x5, #(-12*16 + 11*16)] // .e...................................................................................................................................................|...e...................................... + // sub v24.4s, v13.4s, v14.4s // .......................................................e.............................................................................................|.......................................... + // add v13.4s, v13.4s, v14.4s // ..........................................................e..........................................................................................|.......................................... + // mul v14.4s, v24.4s, v1.4s // .............................................................e.......................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v5.4s // .................................................................e...................................................................................|.......................................... 
+ // mls v14.4s, v24.4s, v8.s[0] // ........................................................................e............................................................................|.......................................... + // sub v24.4s, v15.4s, v16.4s // ......................................................e..............................................................................................|.......................................... + // add v15.4s, v15.4s, v16.4s // .........................................................e...........................................................................................|.......................................... + // mul v16.4s, v24.4s, v2.4s // ...............................................................e.....................................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v6.4s // ...........................................................e.........................................................................................|.......................................... + // mls v16.4s, v24.4s, v8.s[0] // .......................................................................e.............................................................................|.......................................... + // sub v24.4s, v13.4s, v15.4s // ................................................................e....................................................................................|.......................................... + // add v13.4s, v13.4s, v15.4s // ..............................................................................e......................................................................|.......................................... 
+ // mul v15.4s, v24.4s, v0.4s // ......................................................................e..............................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ....................................................................e................................................................................|.......................................... + // mls v15.4s, v24.4s, v8.s[0] // ............................................................................e........................................................................|.......................................... + // sub v24.4s, v14.4s, v16.4s // .............................................................................e.......................................................................|.......................................... + // add v14.4s, v14.4s, v16.4s // ................................................................................e....................................................................|.......................................... + // mul v16.4s, v24.4s, v0.4s // .................................................................................e...................................................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v4.4s // ..................................................................................e..................................................................|.......................................... + // mls v16.4s, v24.4s, v8.s[0] // .......................................................................................e.............................................................|.......................................... 
+ // trn1 v25.4s, v9.4s, v10.4s // .....................................................................................e...............................................................|.......................................... + // trn2 v26.4s, v9.4s, v10.4s // ...............................................................................e.....................................................................|.......................................... + // trn1 v27.4s, v11.4s, v12.4s // ......................................................................................e..............................................................|.......................................... + // trn2 v28.4s, v11.4s, v12.4s // ...................................................................................e.................................................................|.......................................... + // trn2 v11.2d, v25.2d, v27.2d // ...........................................................................................e.........................................................|.......................................... + // trn2 v12.2d, v26.2d, v28.2d // ........................................................................................e............................................................|.......................................... + // trn1 v9.2d, v25.2d, v27.2d // .........................................................................................e...........................................................|.......................................... + // trn1 v10.2d, v26.2d, v28.2d // ..........................................................................................e..........................................................|.......................................... 
+ // trn1 v25.4s, v13.4s, v14.4s // ............................................................................................e........................................................|.......................................... + // trn2 v26.4s, v13.4s, v14.4s // ....................................................................................e................................................................|.......................................... + // trn1 v27.4s, v15.4s, v16.4s // .............................................................................................e.......................................................|.......................................... + // trn2 v28.4s, v15.4s, v16.4s // ..............................................................................................e......................................................|.......................................... + // trn2 v15.2d, v25.2d, v27.2d // .................................................................................................e...................................................|.......................................... + // trn2 v16.2d, v26.2d, v28.2d // ..................................................................................................e..................................................|.......................................... + // trn1 v13.2d, v25.2d, v27.2d // ...................................................................................................e.................................................|.......................................... + // trn1 v14.2d, v26.2d, v28.2d // ....................................................................................................e................................................|.......................................... 
+ // ldr q0, [x4], #64 // .....................................................e...............................................................................................|.......................................... + // ldr q1, [x4, #(-64 + 16)] // ..................................e..................................................................................................................|....................................e..... + // ldr q2, [x4, #(-64 + 32)] // ........................................................e............................................................................................|.......................................... + // ldr q3, [x4, #(-64 + 48)] // .....................................................................................................e...............................................|.......................................... + // sub v24.4s, v9.4s, v10.4s // ......................................................................................................e..............................................|.......................................... + // add v9.4s, v9.4s, v10.4s // ...............................................................................................e.....................................................|.......................................... + // mul v10.4s, v24.4s, v1.s[2] // .............................................................................................................e.......................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ..............................................................................................................e......................................|.......................................... 
+ // mls v10.4s, v24.4s, v8.s[0] // ....................................................................................................................e................................|.......................................... + // sub v24.4s, v11.4s, v12.4s // ...........................................................................................................e.........................................|.......................................... + // add v11.4s, v11.4s, v12.4s // ................................................................................................e....................................................|.......................................... + // mul v12.4s, v24.4s, v2.s[0] // .................................................................................................................e...................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................e....................................|.......................................... + // mls v12.4s, v24.4s, v8.s[0] // .........................................................................................................................e...........................|.......................................... + // sub v24.4s, v13.4s, v14.4s // .......................................................................................................................e.............................|.......................................... + // add v13.4s, v13.4s, v14.4s // ........................................................................................................e............................................|.......................................... 
+ // mul v14.4s, v24.4s, v2.s[2] // ..........................................................................................................................e..........................|.......................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ...........................................................................................................................e.........................|.......................................... + // mls v14.4s, v24.4s, v8.s[0] // .................................................................................................................................e...................|.......................................... + // sub v24.4s, v15.4s, v16.4s // ..........................................................................................................e..........................................|.......................................... + // add v15.4s, v15.4s, v16.4s // .......................................................................................................e.............................................|.......................................... + // mul v16.4s, v24.4s, v3.s[0] // ...................................................................................................................e.................................|.......................................... + // sqrdmulh v24.4s, v24.4s, v3.s[1] // ........................................................................................................................e............................|.......................................... + // mls v16.4s, v24.4s, v8.s[0] // ...............................................................................................................................e.....................|.......................................... 
+ // sub v24.4s, v9.4s, v11.4s // .............................................................................................................................e.......................|.......................................... + // add v9.4s, v9.4s, v11.4s // .........................................................................................................e...........................................|.......................................... + // mul v11.4s, v24.4s, v0.s[2] // ....................................................................................................................................................e|.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // ................................................................................................................................................e....|.......................................... + // mls v11.4s, v24.4s, v8.s[0] // .....*...............................................................................................................................................|.......*.................................. + // sub v24.4s, v10.4s, v12.4s // ................................................................................................................................e....................|.......................................... + // add v10.4s, v10.4s, v12.4s // ..............................................................................................................................e......................|.......................................... + // mul v12.4s, v24.4s, v0.s[2] // ....................................................................................................................................e................|.......................................... 
+ // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....................................................................................................................................e...............|.......................................... + // mls v12.4s, v24.4s, v8.s[0] // ..........................................................................................................................................e..........|.......................................... + // sub v24.4s, v13.4s, v15.4s // ............................................................................................................e........................................|.......................................... + // add v13.4s, v13.4s, v15.4s // ..................................................................................................................e..................................|.......................................... + // mul v15.4s, v24.4s, v1.s[0] // ...................................................................................................................................................e.|.......................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .....................................................................................................................................................*.......................................... + // mls v15.4s, v24.4s, v8.s[0] // ....*................................................................................................................................................|......*................................... + // sub v24.4s, v14.4s, v16.4s // .........................................................................................................................................e...........|.......................................... 
+ // add v14.4s, v14.4s, v16.4s // ...........................................................................................................................................e.........|.......................................... + // mul v16.4s, v24.4s, v1.s[0] // .............................................................................................................................................e.......|.......................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // .................................................................................................................................................e...|.......................................... + // mls v16.4s, v24.4s, v8.s[0] // ..*..................................................................................................................................................|....*..................................... + // srshr v24.4S, v9.4S, #23 // ...............................................................................................................e.....................................|.......................................... + // mls v9.4s, v24.4s, v8.4s // .....................................................................................................................e...............................|.......................................... + // srshr v24.4S, v10.4S, #23 // ...................................................................................................................................e.................|.......................................... + // mls v10.4s, v24.4s, v8.4s // .....................................................................................................................................................|*......................................... 
+ // srshr v24.4S, v13.4S, #23 // ......................................................................................................................e..............................|.......................................... + // mls v13.4s, v24.4s, v8.4s // ............................................................................................................................e........................|.......................................... + // srshr v24.4S, v14.4S, #23 // ..............................................................................................................................................e......|.......................................... + // mls v14.4s, v24.4s, v8.4s // .....................................................................................................................................................|.*........................................ + // sub v24.4s, v9.4s, v13.4s // ..................................................................................................................................e..................|.......................................... + // add v9.4s, v9.4s, v13.4s // ........................................................................................................................................e............|.......................................... + // mul v13.4s, v24.4s, v0.s[0] // .......................................................................................................................................e.............|.......................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ......................................................................................................................................e..............|.......................................... 
+ // mls v13.4s, v24.4s, v8.s[0] // ............................................................................................................................................e........|.......................................... + // sub v24.4s, v10.4s, v14.4s // ......*..............................................................................................................................................|........*................................. + // add v10.4s, v10.4s, v14.4s // .......*.............................................................................................................................................|.........*................................ + // mul v14.4s, v24.4s, v0.s[0] // ...........................*.........................................................................................................................|.............................*............ + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................*..........................................................................................................................|............................*............. + // mls v14.4s, v24.4s, v8.s[0] // .................................*...................................................................................................................|...................................*...... + // sub v24.4s, v11.4s, v15.4s // ...............*.....................................................................................................................................|.................*........................ + // add v11.4s, v11.4s, v15.4s // ..............*......................................................................................................................................|................*......................... 
+ // mul v15.4s, v24.4s, v0.s[0] // ......................*..............................................................................................................................|........................*................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .....................*...............................................................................................................................|.......................*.................. + // mls v15.4s, v24.4s, v8.s[0] // ................................*....................................................................................................................|..................................*....... + // sub v24.4s, v12.4s, v16.4s // ............*........................................................................................................................................|..............*........................... + // add v12.4s, v12.4s, v16.4s // ..........*..........................................................................................................................................|............*............................. + // mul v16.4s, v24.4s, v0.s[0] // ...................*.................................................................................................................................|.....................*.................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // ....................*................................................................................................................................|......................*................... + // mls v16.4s, v24.4s, v8.s[0] // ..............................*......................................................................................................................|................................*......... 
+ // str q9, [x1], #(16*4) // ...............................................................................................................................................e.....|.......................................... + // str q10, [x1, #(-16*4 + 1*16)] // ................*....................................................................................................................................|..................*....................... + // str q11, [x1, #(-16*4 + 2*16)] // .......................*.............................................................................................................................|.........................*................ + // str q12, [x1, #(-16*4 + 3*16)] // .................*...................................................................................................................................|...................*...................... + // str q13, [x2], #(16*4) // ..................................................................................................................................................e..|.......................................... + // str q14, [x2, #(-16*4 + 1*16)] // ......................................*..............................................................................................................|........................................*. + // str q15, [x2, #(-16*4 + 2*16)] // ....................................*................................................................................................................|......................................*... + // str q16, [x2, #(-16*4 + 3*16)] // ...................................*.................................................................................................................|.....................................*.... 
+ // add x1, x1, #64 // ........................*............................................................................................................................|..........................*............... + // add x2, x2, #64 // .......................................*.............................................................................................................|.........................................* + + sub count, count, #1 + cbnz count, layer45678_start + // Straight-line code reached when the cbnz above falls through (count == 0). + // NOTE(review): presumably the software-pipelining postamble that completes the final iteration -- confirm. + // The dot/star comment on each instruction marks its index in the "original source code" listing that follows this sequence; "// gap" lines are comments only and emit no instruction. + sqrdmulh v13.4S, v9.4S, v29.S[1] // *............................ + mls v1.4S, v20.4S, v8.S[0] // ...*......................... + // gap // ............................. + // gap // ............................. + mls v26.4S, v22.4S, v8.4S // .*........................... + mls v31.4S, v12.4S, v8.4S // ..*.......................... + // gap // ............................. + // gap // ............................. + mls v4.4S, v30.4S, v8.S[0] // .....*....................... + // gap // ............................. + // gap // ............................. + // gap // ............................. + mls v7.4S, v13.4S, v8.S[0] // ....*........................ + sub v18.4S, v10.4S, v1.4S // .........*................... + // gap // ............................. + // gap // ............................. + sub v28.4S, v26.4S, v31.4S // ......*...................... + add v12.4S, v10.4S, v1.4S // ........*.................... + // gap // ............................. + // gap // ............................. + add v2.4S, v26.4S, v31.4S // .......*..................... + mul v1.4S, v18.4S, v19.S[0] // ..............*.............. + // gap // ............................. + // gap // ............................. + sub v22.4S, v4.4S, v7.4S // ...........*................. + str q12, [x1, #-16] // .............*............... + sqrdmulh v13.4S, v18.4S, v19.S[1] // ...............*............. + // gap // .............................
+ sqrdmulh v20.4S, v28.4S, v19.S[1] // ....................*........ + mul v12.4S, v28.4S, v19.S[0] // .....................*....... + str q2, [x1, #-48] // ............*................ + // gap // ............................. + mul v18.4S, v22.4S, v19.S[0] // .................*........... + sqrdmulh v22.4S, v22.4S, v19.S[1] // ................*............ + // gap // ............................. + // gap // ............................. + add v7.4S, v4.4S, v7.4S // ..........*.................. + mls v1.4S, v13.4S, v8.S[0] // ......................*...... + // gap // ............................. + // gap // ............................. + mls v12.4S, v20.4S, v8.S[0] // ........................*.... + // gap // ............................. + // gap // ............................. + // gap // ............................. + str q7, [x1, #-32] // ..................*.......... + add x1, x1, #64 // ...................*......... + mls v18.4S, v22.4S, v8.S[0] // .......................*..... + // gap // ............................. + str q1, [x2, #-16] // .........................*... + // gap // ............................. + // gap // ............................. + // gap // ............................. + str q12, [x2, #-48] // ...........................*. + // gap // ............................. + // gap // ............................. + // gap // ............................. + str q18, [x2, #-32] // ..........................*.. + add x2, x2, #64 // ............................* + // gap // ............................. + // gap // ............................. + + // Listing below: the sequence above before reordering, kept as comments with its original register names; each dot/star column marks the instruction's index (gaps not counted) in the reordered sequence. + // original source code + // sqrdmulh v13.4S, v9.4S, v29.S[1] // *............................ + // mls v26.4S, v22.4S, v8.4S // ..*.......................... + // mls v31.4S, v12.4S, v8.4S // ...*......................... + // mls v1.4S, v20.4S, v8.S[0] // .*........................... + // mls v7.4S, v13.4S, v8.S[0] // .....*.......................
+ // mls v4.4S, v30.4S, v8.S[0] // ....*........................ + // sub v30.4S, v26.4S, v31.4S // .......*..................... + // add v22.4S, v26.4S, v31.4S // .........*................... + // add v12.4S, v10.4S, v1.4S // ........*.................... + // sub v1.4S, v10.4S, v1.4S // ......*...................... + // add v9.4S, v4.4S, v7.4S // ...................*......... + // sub v4.4S, v4.4S, v7.4S // ...........*................. + // str q22, [x1, #-48] // ................*............ + // str q12, [x1, #-16] // ............*................ + // mul v20.4S, v1.4S, v19.S[0] // ..........*.................. + // sqrdmulh v12.4S, v1.4S, v19.S[1] // .............*............... + // sqrdmulh v31.4S, v4.4S, v19.S[1] // ..................*.......... + // mul v13.4S, v4.4S, v19.S[0] // .................*........... + // str q9, [x1, #-32] // ......................*...... + // add x1, x1, #64 // .......................*..... + // sqrdmulh v22.4S, v30.4S, v19.S[1] // ..............*.............. + // mul v1.4S, v30.4S, v19.S[0] // ...............*............. + // mls v20.4S, v12.4S, v8.S[0] // ....................*........ + // mls v13.4S, v31.4S, v8.S[0] // ........................*.... + // mls v1.4S, v22.4S, v8.S[0] // .....................*....... + // str q20, [x2, #-16] // .........................*... + // str q13, [x2, #-32] // ...........................*. + // str q1, [x2, #-48] // ..........................*..
+ // add x2, x2, #64 // ............................* + + +// ----------------------------------------------------------------------------- + + // Setup executed once before layer123_start: register aliases, broadcast constants, loop counter and root loading. + ninv .req v25 + ninv_tw .req v26 + modulus_half .req v30 + neg_modulus_half .req v31 + + // Each ld1r reads one 32-bit element from [xtmp] and replicates it into all four lanes. + // NOTE(review): ASM_LOAD is defined outside this chunk; presumably it places the address of the named symbol in xtmp -- confirm. + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.4s}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.4s}, [xtmp] + + // Per lane: modulus_half = consts >> 1 (logical shift), neg_modulus_half = -modulus_half. + // NOTE(review): assumes every lane of consts holds the modulus -- confirm where consts is loaded. + ushr modulus_half.4S, consts.4S, #1 + neg neg_modulus_half.4S, modulus_half.4S + + // count starts at 8 and is decremented once more just before the layer123_start label. + // load_roots_123 is a macro defined outside this chunk. + mov count, #8 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + // Straight-line instructions that run once before layer123_start; the "original source code" listing after them shows the same instructions in their pre-reordering form. + ldr q15, [x0, #0] // *................. + // gap // .................. + // gap // .................. + ldr q5, [x0, #128] // .*................ + ldr q9, [x0, #256] // ....*............. + ldr q17, [x0, #384] // .....*............ + // gap // .................. + // gap // .................. + ldr q10, [x0, #512] // ......*........... + // gap // .................. + // gap // .................. + // gap // .................. + ldr q23, [x0, #768] // ........*......... + // gap // .................. + // gap // .................. + // gap // .................. + sub v27.4S, v15.4S, v5.4S // ..*............... + ldr q6, [x0, #896] // .........*........ + // gap // .................. + // gap // .................. + add v14.4S, v15.4S, v5.4S // ...............*.. + ldr q18, [x0, #640] // .......*.......... + // gap // .................. + sub v12.4S, v9.4S, v17.4S // ..........*....... + sqrdmulh v5.4S, v27.4S, v1.S[3] // ...........*...... + mul v16.4S, v27.4S, v1.S[2] // ...*.............. + // gap // .................. + // gap // .................. + add v27.4S, v9.4S, v17.4S // ..............*... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + add v28.4S, v23.4S, v6.4S // .................* + mul v13.4S, v12.4S, v2.S[0] // .............*.... + mls v16.4S, v5.4S, v8.S[0] // ................*. + // gap // .................. + // gap // ..................
+ sub v17.4S, v10.4S, v18.4S // ............*..... + + // original source code + // ldr q11, [x0, #0] // *................. + // ldr q7, [x0, #128] // .*................ + // sub v19.4S, v11.4S, v7.4S // ......*........... + // mul v16.4S, v19.4S, v1.S[2] // ............*..... + // ldr q24, [x0, #256] // ..*............... + // ldr q14, [x0, #384] // ...*.............. + // ldr q10, [x0, #512] // ....*............. + // ldr q18, [x0, #640] // .........*........ + // ldr q23, [x0, #768] // .....*............ + // ldr q6, [x0, #896] // .......*.......... + // sub v12.4S, v24.4S, v14.4S // ..........*....... + // sqrdmulh v22.4S, v19.4S, v1.S[3] // ...........*...... + // sub v17.4S, v10.4S, v18.4S // .................* + // mul v13.4S, v12.4S, v2.S[0] // ...............*.. + // add v27.4S, v24.4S, v14.4S // .............*.... + // add v14.4S, v11.4S, v7.4S // ........*......... + // mls v16.4S, v22.4S, v8.S[0] // ................*. + // add v28.4S, v23.4S, v6.4S // ..............*... + + sub count, count, #1 +layer123_start: + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v7.4S, v12.4S, v2.S[1] // ................*....................................................................................................... + sub v9.4S, v23.4S, v6.4S // .......................*................................................................................................ + // gap // ........................................................................................................................ + mul v24.4S, v17.4S, v2.S[2] // ....................*................................................................................................... 
+ sqrdmulh v17.4S, v17.4S, v2.S[3] // .....................*.................................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v20.4S, v9.4S, v3.S[0] // .........................*.............................................................................................. + add v15.4S, v10.4S, v18.4S // ...................*.................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v7.4S, v8.S[0] // .................*...................................................................................................... + sub v23.4S, v14.4S, v27.4S // ............................*........................................................................................... + ldr q11, [x0, #16] // e....................................................................................................................... + ldr q7, [x0, #144] // .e...................................................................................................................... + mls v24.4S, v17.4S, v8.S[0] // ......................*................................................................................................. 
+ sqrdmulh v18.4S, v9.4S, v3.S[1] // ..........................*............................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v22.4S, v23.4S, v0.S[2] // ..............................*......................................................................................... + sqrdmulh v5.4S, v23.4S, v0.S[3] // ...............................*........................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v12.4S, v15.4S, v28.4S // ......................................*................................................................................. + sub v17.4S, v16.4S, v13.4S // .................................*...................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v18.4S, v8.S[0] // ...........................*............................................................................................ + add v10.4S, v14.4S, v27.4S // .............................*.......................................................................................... 
+ // gap // ........................................................................................................................ + mls v22.4S, v5.4S, v8.S[0] // ................................*....................................................................................... + // gap // ........................................................................................................................ + sub v19.4S, v11.4S, v7.4S // ........e............................................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v9.4S, v17.4S, v0.S[2] // ...................................*.................................................................................... + sqrdmulh v18.4S, v17.4S, v0.S[3] // ....................................*................................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v23.4S, v12.4S, v1.S[0] // ........................................*............................................................................... + sqrdmulh v29.4S, v12.4S, v1.S[1] // .........................................*.............................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sub v14.4S, v24.4S, v20.4S // ...........................................*............................................................................ + add v4.4S, v16.4S, v13.4S // ..................................*..................................................................................... + mul v16.4S, v19.4S, v1.S[2] // ..........e............................................................................................................. + // gap // ........................................................................................................................ + mls v9.4S, v18.4S, v8.S[0] // .....................................*.................................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v23.4S, v29.4S, v8.S[0] // ..........................................*............................................................................. + add v18.4S, v24.4S, v20.4S // ............................................*........................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v20.4S, v15.4S, v28.4S // .......................................*................................................................................ 
+ mul v13.4S, v14.4S, v1.S[0] // .............................................*.......................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v15.4S, v14.4S, v1.S[1] // ..............................................*......................................................................... + sub v17.4S, v4.4S, v18.4S // .....................................................*.................................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v14.4S, v10.4S, v20.4S // .................................................*...................................................................... + sub v29.4S, v10.4S, v20.4S // ................................................*....................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v12.4S, v17.4S, v0.S[0] // .......................................................*................................................................ + sqrdmulh v20.4S, v17.4S, v0.S[1] // ........................................................*............................................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + add v27.4S, v4.4S, v18.4S // ......................................................*................................................................. + mul v18.4S, v29.4S, v0.S[0] // ..................................................*..................................................................... + // gap // ........................................................................................................................ + mls v13.4S, v15.4S, v8.S[0] // ...............................................*........................................................................ + // gap // ........................................................................................................................ + sqrdmulh v24.4S, v29.4S, v0.S[1] // ...................................................*.................................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v12.4S, v20.4S, v8.S[0] // .........................................................*.............................................................. + sub v4.4S, v22.4S, v23.4S // ..........................................................*............................................................. + // gap // ........................................................................................................................ 
+ sqrdmulh v5.4S, v14.4S, v26.4S // .........................................................................................*.............................. + // gap // ........................................................................................................................ + add v17.4S, v22.4S, v23.4S // ...........................................................*............................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v20.4S, v4.4S, v0.S[0] // ............................................................*........................................................... + sub v28.4S, v9.4S, v13.4S // ...............................................................*........................................................ + // gap // ........................................................................................................................ + mls v18.4S, v24.4S, v8.S[0] // ....................................................*................................................................... + // gap // ........................................................................................................................ + add v23.4S, v9.4S, v13.4S // ................................................................*....................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ 
+ sqrdmulh v6.4S, v4.4S, v0.S[1] // .............................................................*.......................................................... + mul v22.4S, v28.4S, v0.S[0] // .................................................................*...................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v29.4S, v17.4S, v26.4S // ...............................................................................................*........................ + mul v13.4S, v14.4S, v25.4S // ........................................................................................*............................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v15.4S, v31.4S, v18.4S // ....................................................................*................................................... + cmge v21.4S, v18.4S, v30.4S // .....................................................................*.................................................. + // gap // ........................................................................................................................ + mls v20.4S, v6.4S, v8.S[0] // ..............................................................*......................................................... + // gap // ........................................................................................................................ 
+ sqrdmulh v9.4S, v28.4S, v0.S[1] // ..................................................................*..................................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sqrdmulh v28.4S, v23.4S, v26.4S // ..................................................................................................*..................... + sub v24.4S, v15.4S, v21.4S // ......................................................................*................................................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v4.4S, v12.4S, v30.4S // .........................................................................*.............................................. + cmge v6.4S, v31.4S, v12.4S // ........................................................................*............................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v21.4S, v31.4S, v20.4S // ............................................................................*........................................... + cmge v10.4S, v20.4S, v30.4S // .............................................................................*.......................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v22.4S, v9.4S, v8.S[0] // ...................................................................*.................................................... + mls v18.4S, v24.4S, v8.4S // .......................................................................*................................................ + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + sub v15.4S, v6.4S, v4.4S // ..........................................................................*............................................. + sub v21.4S, v21.4S, v10.4S // ..............................................................................*......................................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v4.4S, v27.4S, v25.4S // ...........................................................................................*............................ + sqrdmulh v14.4S, v27.4S, v26.4S // ............................................................................................*........................... + cmge v24.4S, v22.4S, v30.4S // .................................................................................*...................................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mul v9.4S, v17.4S, v25.4S // ..............................................................................................*......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v6.4S, v31.4S, v22.4S // ................................................................................*....................................... + mls v20.4S, v21.4S, v8.4S // ...............................................................................*........................................ + mls v4.4S, v14.4S, v8.S[0] // .............................................................................................*.......................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + mls v13.4S, v5.4S, v8.S[0] // ..........................................................................................*............................. + // gap // ........................................................................................................................ + mls v12.4S, v15.4S, v8.4S // ...........................................................................*............................................ + mls v9.4S, v29.4S, v8.S[0] // ................................................................................................*....................... 
+ // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + str q20, [x0, #768] // ......................................................................................*................................. + mul v20.4S, v23.4S, v25.4S // .................................................................................................*...................... + sub v23.4S, v6.4S, v24.4S // ..................................................................................*..................................... + cmge v29.4S, v31.4S, v13.4S // ....................................................................................................*................... + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v21.4S, v13.4S, v30.4S // .....................................................................................................*.................. + // gap // ........................................................................................................................ + // gap // ........................................................................................................................ + cmge v15.4S, v31.4S, v4.4S // ........................................................................................................*............... + cmge v5.4S, v4.4S, v30.4S // .........................................................................................................*.............. + // gap // ........................................................................................................................ 
+ // gap // ........................................................................................................................ + mls v20.4S, v28.4S, v8.S[0] // ...................................................................................................*.................... + sub v29.4S, v29.4S, v21.4S // ......................................................................................................*................. + // gap // ........................................................................................................................ + ldr q24, [x0, #272] // ..e..................................................................................................................... + mls v22.4S, v23.4S, v8.4S // ...................................................................................*.................................... + sub v21.4S, v15.4S, v5.4S // ..........................................................................................................*............. + ldr q14, [x0, #400] // ...e.................................................................................................................... + str q18, [x0, #512] // ....................................................................................*................................... + cmge v27.4S, v31.4S, v9.4S // ............................................................................................................*........... + cmge v28.4S, v9.4S, v30.4S // .............................................................................................................*.......... + ldr q10, [x0, #528] // ....e................................................................................................................... + ldr q18, [x0, #656] // .....e.................................................................................................................. 
+ cmge v17.4S, v20.4S, v30.4S // .................................................................................................................*...... + cmge v6.4S, v31.4S, v20.4S // ................................................................................................................*....... + sub v15.4S, v27.4S, v28.4S // ..............................................................................................................*......... + str q12, [x0, #640] // .....................................................................................*.................................. + mls v13.4S, v29.4S, v8.4S // .......................................................................................................*................ + // gap // ........................................................................................................................ + mls v4.4S, v21.4S, v8.4S // ...........................................................................................................*............ + ldr q23, [x0, #784] // ......e................................................................................................................. + // gap // ........................................................................................................................ + sub v5.4S, v6.4S, v17.4S // ..................................................................................................................*..... + mls v9.4S, v15.4S, v8.4S // ...............................................................................................................*........ + ldr q6, [x0, #912] // .......e................................................................................................................ + str q22, [x0, #896] // .......................................................................................*................................ 
+ sub v12.4S, v24.4S, v14.4S // .............e.......................................................................................................... + sqrdmulh v22.4S, v19.4S, v1.S[3] // ...........e............................................................................................................ + // gap // ........................................................................................................................ + mls v20.4S, v5.4S, v8.4S // ...................................................................................................................*.... + str q13, [x0], #(16) // ....................................................................................................................*... + sub v17.4S, v10.4S, v18.4S // ..................e..................................................................................................... + mul v13.4S, v12.4S, v2.S[0] // ...............e........................................................................................................ + // gap // ........................................................................................................................ + str q4, [x0, #112] // .....................................................................................................................*.. + add v27.4S, v24.4S, v14.4S // ..............e......................................................................................................... + add v14.4S, v11.4S, v7.4S // .........e.............................................................................................................. + // gap // ........................................................................................................................ + str q9, [x0, #240] // ......................................................................................................................*. 
+ mls v16.4S, v22.4S, v8.S[0] // ............e........................................................................................................... + // gap // ........................................................................................................................ + str q20, [x0, #368] // .......................................................................................................................* + add v28.4S, v23.4S, v6.4S // ........................e............................................................................................... + + // original source code + // ldr q9, [x0, #0] // e...............................................................................................................|.......e.............................................................................................................. + // ldr q10, [x0, #(1*(1024/8))] // .e..............................................................................................................|........e............................................................................................................. + // ldr q11, [x0, #(2*(1024/8))] // ...............................................................................e................................|......................................................................................e............................... + // ldr q12, [x0, #(3*(1024/8))] // ..................................................................................e.............................|.........................................................................................e............................ + // ldr q13, [x0, #(4*(1024/8))] // ......................................................................................e.........................|.............................................................................................e........................ 
+ // ldr q14, [x0, #(5*(1024/8))] // .......................................................................................e........................|..............................................................................................e....................... + // ldr q15, [x0, #(6*(1024/8))] // ..............................................................................................e.................|.....................................................................................................e................ + // ldr q16, [x0, #(7*(1024/8))] // .................................................................................................e..............|........................................................................................................e............. + // sub v24.4s, v9.4s, v10.4s // ...........e....................................................................................................|..................e................................................................................................... + // add v9.4s, v9.4s, v10.4s // ...........................................................................................................e....|..................................................................................................................e... + // mul v10.4s, v24.4s, v1.s[2] // ..................e.............................................................................................|.........................e............................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[3] // ....................................................................................................e...........|...........................................................................................................e.......... 
+ // mls v10.4s, v24.4s, v8.s[0] // .............................................................................................................e..|....................................................................................................................e. + // sub v24.4s, v11.4s, v12.4s // ...................................................................................................e............|..........................................................................................................e........... + // add v11.4s, v11.4s, v12.4s // ..........................................................................................................e.....|.................................................................................................................e.... + // mul v12.4s, v24.4s, v2.s[0] // ........................................................................................................e.......|...............................................................................................................e...... + // sqrdmulh v24.4s, v24.4s, v2.s[1] // ................................................................................................................*...................................................................................................................... + // mls v12.4s, v24.4s, v8.s[0] // ................................................................................................................|.....*................................................................................................................ + // sub v24.4s, v13.4s, v14.4s // .......................................................................................................e........|..............................................................................................................e....... 
+ // add v13.4s, v13.4s, v14.4s // ................................................................................................................|....*................................................................................................................. + // mul v14.4s, v24.4s, v2.s[2] // ................................................................................................................|.*.................................................................................................................... + // sqrdmulh v24.4s, v24.4s, v2.s[3] // ................................................................................................................|..*................................................................................................................... + // mls v14.4s, v24.4s, v8.s[0] // ..*.............................................................................................................|.........*............................................................................................................ + // sub v24.4s, v15.4s, v16.4s // ................................................................................................................|*..................................................................................................................... + // add v15.4s, v15.4s, v16.4s // ...............................................................................................................e|...................................................................................................................... + // mul v16.4s, v24.4s, v3.s[0] // ................................................................................................................|...*.................................................................................................................. 
+ // sqrdmulh v24.4s, v24.4s, v3.s[1] // ...*............................................................................................................|..........*........................................................................................................... + // mls v16.4s, v24.4s, v8.s[0] // ........*.......................................................................................................|...............*...................................................................................................... + // sub v24.4s, v9.4s, v11.4s // ................................................................................................................|......*............................................................................................................... + // add v9.4s, v9.4s, v11.4s // .........*......................................................................................................|................*..................................................................................................... + // mul v11.4s, v24.4s, v0.s[2] // ....*...........................................................................................................|...........*.......................................................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .....*..........................................................................................................|............*......................................................................................................... + // mls v11.4s, v24.4s, v8.s[0] // ..........*.....................................................................................................|.................*.................................................................................................... 
+ // sub v24.4s, v10.4s, v12.4s // .......*........................................................................................................|..............*....................................................................................................... + // add v10.4s, v10.4s, v12.4s // .................*..............................................................................................|........................*............................................................................................. + // mul v12.4s, v24.4s, v0.s[2] // ............*...................................................................................................|...................*.................................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[3] // .............*..................................................................................................|....................*................................................................................................. + // mls v12.4s, v24.4s, v8.s[0] // ...................*............................................................................................|..........................*........................................................................................... + // sub v24.4s, v13.4s, v15.4s // ......*.........................................................................................................|.............*........................................................................................................ + // add v13.4s, v13.4s, v15.4s // ......................*.........................................................................................|.............................*........................................................................................ 
+ // mul v15.4s, v24.4s, v1.s[0] // ..............*.................................................................................................|.....................*................................................................................................ + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ...............*................................................................................................|......................*............................................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ....................*...........................................................................................|...........................*.......................................................................................... + // sub v24.4s, v14.4s, v16.4s // ................*...............................................................................................|.......................*.............................................................................................. + // add v14.4s, v14.4s, v16.4s // .....................*..........................................................................................|............................*......................................................................................... + // mul v16.4s, v24.4s, v1.s[0] // .......................*........................................................................................|..............................*....................................................................................... + // sqrdmulh v24.4s, v24.4s, v1.s[1] // ........................*.......................................................................................|...............................*...................................................................................... 
+ // mls v16.4s, v24.4s, v8.s[0] // ................................*...............................................................................|.......................................*.............................................................................. + // sub v24.4s, v9.4s, v13.4s // ...........................*....................................................................................|..................................*................................................................................... + // add v9.4s, v9.4s, v13.4s // ..........................*.....................................................................................|.................................*.................................................................................... + // mul v13.4s, v24.4s, v0.s[0] // ...............................*................................................................................|......................................*............................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................*..............................................................................|........................................*............................................................................. + // mls v13.4s, v24.4s, v8.s[0] // ........................................*.......................................................................|...............................................*...................................................................... + // sub v24.4s, v10.4s, v14.4s // .........................*......................................................................................|................................*..................................................................................... 
+ // add v10.4s, v10.4s, v14.4s // ..............................*.................................................................................|.....................................*................................................................................ + // mul v14.4s, v24.4s, v0.s[0] // ............................*...................................................................................|...................................*.................................................................................. + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .............................*..................................................................................|....................................*................................................................................. + // mls v14.4s, v24.4s, v8.s[0] // ..................................*.............................................................................|.........................................*............................................................................ + // sub v24.4s, v11.4s, v15.4s // ...................................*............................................................................|..........................................*........................................................................... + // add v11.4s, v11.4s, v15.4s // .....................................*..........................................................................|............................................*......................................................................... + // mul v15.4s, v24.4s, v0.s[0] // ......................................*.........................................................................|.............................................*........................................................................ 
+ // sqrdmulh v24.4s, v24.4s, v0.s[1] // ..........................................*.....................................................................|.................................................*.................................................................... + // mls v15.4s, v24.4s, v8.s[0] // ................................................*...............................................................|.......................................................*.............................................................. + // sub v24.4s, v12.4s, v16.4s // .......................................*........................................................................|..............................................*....................................................................... + // add v12.4s, v12.4s, v16.4s // .........................................*......................................................................|................................................*..................................................................... + // mul v16.4s, v24.4s, v0.s[0] // ...........................................*....................................................................|..................................................*................................................................... + // sqrdmulh v24.4s, v24.4s, v0.s[1] // .................................................*..............................................................|........................................................*............................................................. + // mls v16.4s, v24.4s, v8.s[0] // ........................................................*.......................................................|...............................................................*...................................................... 
+ // cmge v27.4s, v31.4s, v13.4s // ..............................................*.................................................................|.....................................................*................................................................ + // cmge v28.4s, v13.4s, v30.4s // ...............................................*................................................................|......................................................*............................................................... + // sub v28.4s, v27.4s, v28.4s // ...................................................*............................................................|..........................................................*........................................................... + // mls v13.4s, v28.4s, v8.4s // .........................................................*......................................................|................................................................*..................................................... + // cmge v27.4s, v31.4s, v14.4s // .....................................................*..........................................................|............................................................*......................................................... + // cmge v28.4s, v14.4s, v30.4s // ....................................................*...........................................................|...........................................................*.......................................................... + // sub v28.4s, v27.4s, v28.4s // ..........................................................*.....................................................|.................................................................*.................................................... 
+ // mls v14.4s, v28.4s, v8.4s // ....................................................................*...........................................|...........................................................................*.......................................... + // cmge v27.4s, v31.4s, v15.4s // ......................................................*.........................................................|.............................................................*........................................................ + // cmge v28.4s, v15.4s, v30.4s // .......................................................*........................................................|..............................................................*....................................................... + // sub v28.4s, v27.4s, v28.4s // ...........................................................*....................................................|..................................................................*................................................... + // mls v15.4s, v28.4s, v8.4s // .................................................................*..............................................|........................................................................*............................................. + // cmge v27.4s, v31.4s, v16.4s // ................................................................*...............................................|.......................................................................*.............................................. + // cmge v28.4s, v16.4s, v30.4s // ..............................................................*.................................................|.....................................................................*................................................ 
+ // sub v28.4s, v27.4s, v28.4s // ........................................................................*.......................................|...............................................................................*...................................... + // mls v16.4s, v28.4s, v8.4s // ................................................................................*...............................|.......................................................................................*.............................. + // str q13, [x0, #(4*(1024/8))] // ...................................................................................*............................|..........................................................................................*........................... + // str q14, [x0, #(5*(1024/8))] // ...........................................................................................*....................|..................................................................................................*................... + // str q15, [x0, #(6*(1024/8))] // ......................................................................*.........................................|.............................................................................*........................................ + // str q16, [x0, #(7*(1024/8))] // ..................................................................................................*.............|.........................................................................................................*............ + // mul v13.4s, v9.4s, v25.4s // .............................................*..................................................................|....................................................*................................................................. 
+ // sqrdmulh v9.4s, v9.4s, v26.4s // ....................................*...........................................................................|...........................................*.......................................................................... + // mls v13.4s, v9.4s, v8.s[0] // ...................................................................*............................................|..........................................................................*........................................... + // mul v14.4s, v10.4s, v25.4s // ............................................................*...................................................|...................................................................*.................................................. + // sqrdmulh v10.4s, v10.4s, v26.4s // .............................................................*..................................................|....................................................................*................................................. + // mls v14.4s, v10.4s, v8.s[0] // ..................................................................*.............................................|.........................................................................*............................................ + // mul v15.4s, v11.4s, v25.4s // ...............................................................*................................................|......................................................................*............................................... + // sqrdmulh v11.4s, v11.4s, v26.4s // ............................................*...................................................................|...................................................*.................................................................. 
+ // mls v15.4s, v11.4s, v8.s[0] // .....................................................................*..........................................|............................................................................*......................................... + // mul v16.4s, v12.4s, v25.4s // .......................................................................*........................................|..............................................................................*....................................... + // sqrdmulh v12.4s, v12.4s, v26.4s // ..................................................*.............................................................|.........................................................*............................................................ + // mls v16.4s, v12.4s, v8.s[0] // .............................................................................*..................................|....................................................................................*................................. + // cmge v27.4s, v31.4s, v13.4s // .........................................................................*......................................|................................................................................*..................................... + // cmge v28.4s, v13.4s, v30.4s // ..........................................................................*.....................................|.................................................................................*.................................... + // sub v28.4s, v27.4s, v28.4s // ..............................................................................*.................................|.....................................................................................*................................ 
+ // mls v13.4s, v28.4s, v8.4s // ............................................................................................*...................|...................................................................................................*.................. + // cmge v27.4s, v31.4s, v14.4s // ...........................................................................*....................................|..................................................................................*................................... + // cmge v28.4s, v14.4s, v30.4s // ............................................................................*...................................|...................................................................................*.................................. + // sub v28.4s, v27.4s, v28.4s // .................................................................................*..............................|........................................................................................*............................. + // mls v14.4s, v28.4s, v8.4s // .............................................................................................*..................|....................................................................................................*................. + // cmge v27.4s, v31.4s, v15.4s // ....................................................................................*...........................|...........................................................................................*.......................... + // cmge v28.4s, v15.4s, v30.4s // .....................................................................................*..........................|............................................................................................*......................... 
+ // sub v28.4s, v27.4s, v28.4s // ..........................................................................................*.....................|.................................................................................................*.................... + // mls v15.4s, v28.4s, v8.4s // ................................................................................................*...............|.......................................................................................................*.............. + // cmge v27.4s, v31.4s, v16.4s // .........................................................................................*......................|................................................................................................*..................... + // cmge v28.4s, v16.4s, v30.4s // ........................................................................................*.......................|...............................................................................................*...................... + // sub v28.4s, v27.4s, v28.4s // ...............................................................................................*................|......................................................................................................*............... + // mls v16.4s, v28.4s, v8.4s // .....................................................................................................*..........|............................................................................................................*......... + // str q13, [x0], #(16) // ......................................................................................................*.........|.............................................................................................................*........ 
+ // str q14, [x0, #(-16 + 1*(1024/8))] // .........................................................................................................*......|................................................................................................................*..... + // str q15, [x0, #(-16 + 2*(1024/8))] // ............................................................................................................*...|...................................................................................................................*.. + // str q16, [x0, #(-16 + 3*(1024/8))] // ..............................................................................................................*.|.....................................................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v12.4S, v2.S[1] // *..................................................................................................... + sub v23.4S, v23.4S, v6.4S // .*.................................................................................................... + sqrdmulh v5.4S, v17.4S, v2.S[3] // ...*.................................................................................................. + mul v7.4S, v17.4S, v2.S[2] // ..*................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... 
+ // gap // ...................................................................................................... + sqrdmulh v20.4S, v23.4S, v3.S[1] // .........*............................................................................................ + // gap // ...................................................................................................... + mul v17.4S, v23.4S, v3.S[0] // ....*................................................................................................. + add v9.4S, v14.4S, v27.4S // ...............*...................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v13.4S, v12.4S, v8.S[0] // ......*............................................................................................... + add v10.4S, v10.4S, v18.4S // .....*................................................................................................ + // gap // ...................................................................................................... + mls v7.4S, v5.4S, v8.S[0] // ........*............................................................................................. + // gap // ...................................................................................................... + sub v23.4S, v14.4S, v27.4S // .......*.............................................................................................. + // gap // ...................................................................................................... + mls v17.4S, v20.4S, v8.S[0] // ..............*....................................................................................... 
+ // gap // ...................................................................................................... + sub v27.4S, v10.4S, v28.4S // ............*......................................................................................... + sub v11.4S, v16.4S, v13.4S // .............*........................................................................................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v23.4S, v0.S[3] // ...........*.......................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v13.4S, v16.4S, v13.4S // ......................*............................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v5.4S, v11.4S, v0.S[2] // .................*.................................................................................... + sub v20.4S, v7.4S, v17.4S // .....................*................................................................................ + mul v23.4S, v23.4S, v0.S[2] // ..........*........................................................................................... + sqrdmulh v15.4S, v11.4S, v0.S[3] // ..................*................................................................................... 
+ // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v29.4S, v20.4S, v1.S[0] // ...........................*.......................................................................... + // gap // ...................................................................................................... + sqrdmulh v19.4S, v20.4S, v1.S[1] // ............................*......................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v17.4S, v7.4S, v17.4S // .........................*............................................................................ + // gap // ...................................................................................................... + mul v4.4S, v27.4S, v1.S[0] // ...................*.................................................................................. + sqrdmulh v11.4S, v27.4S, v1.S[1] // ....................*................................................................................. + mls v5.4S, v15.4S, v8.S[0] // .......................*.............................................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v15.4S, v13.4S, v17.4S // ..................................*................................................................... + // gap // ...................................................................................................... 
+ // gap // ...................................................................................................... + mls v29.4S, v19.4S, v8.S[0] // ....................................*................................................................. + add v22.4S, v10.4S, v28.4S // ..........................*........................................................................... + mls v23.4S, v12.4S, v8.S[0] // ................*..................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sub v13.4S, v13.4S, v17.4S // .............................*........................................................................ + mls v4.4S, v11.4S, v8.S[0] // ........................*............................................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v11.4S, v15.4S, v26.4S // .................................................................*.................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + add v24.4S, v5.4S, v29.4S // .............................................*........................................................ + add v7.4S, v9.4S, v22.4S // ..............................*....................................................................... 
+ sub v27.4S, v9.4S, v22.4S // ...............................*...................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v10.4S, v24.4S, v25.4S // ...........................................................................*.......................... + sqrdmulh v20.4S, v24.4S, v26.4S // ......................................................*............................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mul v17.4S, v7.4S, v25.4S // .................................................*.................................................... + sub v22.4S, v23.4S, v4.4S // .......................................*.............................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v7.4S, v26.4S // ........................................*............................................................. + mul v6.4S, v27.4S, v0.S[0] // ...................................*.................................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... 
+ sqrdmulh v9.4S, v22.4S, v0.S[1] // ..............................................*....................................................... + // gap // ...................................................................................................... + mls v10.4S, v20.4S, v8.S[0] // .................................................................................*.................... + // gap // ...................................................................................................... + mul v14.4S, v22.4S, v0.S[0] // ..........................................*........................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sub v5.4S, v5.4S, v29.4S // ...........................................*.......................................................... + sqrdmulh v16.4S, v27.4S, v0.S[1] // .....................................*................................................................ + mls v17.4S, v12.4S, v8.S[0] // .......................................................................*.............................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v12.4S, v31.4S, v10.4S // .........................................................................................*............ 
+ cmge v20.4S, v10.4S, v30.4S // ........................................................................................*............. + mls v14.4S, v9.4S, v8.S[0] // ....................................................*................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v24.4S, v5.4S, v0.S[1] // .....................................................*................................................ + cmge v22.4S, v31.4S, v17.4S // .............................................................................*........................ + // gap // ...................................................................................................... + sub v12.4S, v12.4S, v20.4S // ..............................................................................................*....... + // gap // ...................................................................................................... + cmge v20.4S, v17.4S, v30.4S // ..............................................................................*....................... + add v7.4S, v23.4S, v4.4S // .........................................*............................................................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v10.4S, v12.4S, v8.4S // .................................................................................................*.... + cmge v23.4S, v31.4S, v14.4S // ..........................................................*........................................... 
+ // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v6.4S, v16.4S, v8.S[0] // ............................................*......................................................... + cmge v29.4S, v14.4S, v30.4S // ...........................................................*.......................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sub v4.4S, v22.4S, v20.4S // ..................................................................................*................... + mul v18.4S, v7.4S, v25.4S // ...................................................................*.................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q10, [x0, #384] // .....................................................................................................* + mul v20.4S, v15.4S, v25.4S // ................................................................*..................................... + // gap // ...................................................................................................... + sub v28.4S, v23.4S, v29.4S // ...............................................................*...................................... + cmge v9.4S, v31.4S, v6.4S // ..................................................*................................................... 
+ mul v16.4S, v5.4S, v0.S[0] // ...............................................*...................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v17.4S, v4.4S, v8.4S // ............................................................................................*......... + cmge v19.4S, v6.4S, v30.4S // ...................................................*.................................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + sqrdmulh v12.4S, v7.4S, v26.4S // ................................................*..................................................... + // gap // ...................................................................................................... + mls v20.4S, v11.4S, v8.S[0] // ......................................................................*............................... + // gap // ...................................................................................................... + sub v22.4S, v9.4S, v19.4S // .......................................................*.............................................. + sqrdmulh v9.4S, v13.4S, v0.S[1] // .................................*.................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... 
+ mls v14.4S, v28.4S, v8.4S // .....................................................................*................................ + mul v27.4S, v13.4S, v0.S[0] // ................................*..................................................................... + str q17, [x0], #(16) // ..................................................................................................*... + // gap // ...................................................................................................... + mls v6.4S, v22.4S, v8.4S // .............................................................*........................................ + mls v18.4S, v12.4S, v8.S[0] // .........................................................................*............................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v4.4S, v31.4S, v20.4S // ...............................................................................*...................... + mls v16.4S, v24.4S, v8.S[0] // ............................................................*......................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v13.4S, v20.4S, v30.4S // ................................................................................*..................... + mls v27.4S, v9.4S, v8.S[0] // ......................................*............................................................... + str q14, [x0, #752] // ..........................................................................*........................... 
+ // gap // ...................................................................................................... + cmge v22.4S, v31.4S, v18.4S // ......................................................................................*............... + str q6, [x0, #496] // .....................................................................................*................ + cmge v12.4S, v18.4S, v30.4S // .......................................................................................*.............. + // gap // ...................................................................................................... + sub v13.4S, v4.4S, v13.4S // ....................................................................................*................. + cmge v21.4S, v31.4S, v16.4S // ....................................................................*................................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v15.4S, v16.4S, v30.4S // ..................................................................*................................... + cmge v4.4S, v31.4S, v27.4S // .........................................................*............................................ + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + cmge v24.4S, v27.4S, v30.4S // ........................................................*............................................. 
+ sub v12.4S, v22.4S, v12.4S // ..........................................................................................*........... + mls v20.4S, v13.4S, v8.4S // .............................................................................................*........ + sub v23.4S, v21.4S, v15.4S // ............................................................................*......................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v18.4S, v12.4S, v8.4S // ...............................................................................................*...... + sub v24.4S, v4.4S, v24.4S // ..............................................................*....................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + mls v16.4S, v23.4S, v8.4S // ...................................................................................*.................. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q20, [x0, #112] // ...................................................................................................*.. + mls v27.4S, v24.4S, v8.4S // ........................................................................*............................. + // gap // ...................................................................................................... 
+ // gap // ...................................................................................................... + str q18, [x0, #240] // ....................................................................................................*. + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q16, [x0, #880] // ................................................................................................*..... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + str q27, [x0, #624] // ...........................................................................................*.......... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + // gap // ...................................................................................................... + + // original source code + // sqrdmulh v7.4S, v12.4S, v2.S[1] // *..................................................................................................... + // sub v9.4S, v23.4S, v6.4S // .*.................................................................................................... + // mul v24.4S, v17.4S, v2.S[2] // ...*.................................................................................................. 
+ // sqrdmulh v17.4S, v17.4S, v2.S[3] // ..*................................................................................................... + // mul v20.4S, v9.4S, v3.S[0] // .....*................................................................................................ + // add v15.4S, v10.4S, v18.4S // ........*............................................................................................. + // mls v13.4S, v7.4S, v8.S[0] // .......*.............................................................................................. + // sub v23.4S, v14.4S, v27.4S // ..........*........................................................................................... + // mls v24.4S, v17.4S, v8.S[0] // .........*............................................................................................ + // sqrdmulh v18.4S, v9.4S, v3.S[1] // ....*................................................................................................. + // mul v22.4S, v23.4S, v0.S[2] // ..................*................................................................................... + // sqrdmulh v5.4S, v23.4S, v0.S[3] // ..............*....................................................................................... + // sub v12.4S, v15.4S, v28.4S // ............*......................................................................................... + // sub v17.4S, v16.4S, v13.4S // .............*........................................................................................ + // mls v20.4S, v18.4S, v8.S[0] // ...........*.......................................................................................... + // add v10.4S, v14.4S, v27.4S // ......*............................................................................................... + // mls v22.4S, v5.4S, v8.S[0] // .............................*........................................................................ 
+ // mul v9.4S, v17.4S, v0.S[2] // ................*..................................................................................... + // sqrdmulh v18.4S, v17.4S, v0.S[3] // ...................*.................................................................................. + // mul v23.4S, v12.4S, v1.S[0] // .......................*.............................................................................. + // sqrdmulh v29.4S, v12.4S, v1.S[1] // ........................*............................................................................. + // sub v14.4S, v24.4S, v20.4S // .................*.................................................................................... + // add v4.4S, v16.4S, v13.4S // ...............*...................................................................................... + // mls v9.4S, v18.4S, v8.S[0] // .........................*............................................................................ + // mls v23.4S, v29.4S, v8.S[0] // ...............................*...................................................................... + // add v18.4S, v24.4S, v20.4S // ......................*............................................................................... + // add v20.4S, v15.4S, v28.4S // ............................*......................................................................... + // mul v13.4S, v14.4S, v1.S[0] // ....................*................................................................................. + // sqrdmulh v15.4S, v14.4S, v1.S[1] // .....................*................................................................................ + // sub v17.4S, v4.4S, v18.4S // ..............................*....................................................................... + // add v14.4S, v10.4S, v20.4S // ..................................*................................................................... 
+ // sub v29.4S, v10.4S, v20.4S // ...................................*.................................................................. + // mul v12.4S, v17.4S, v0.S[0] // ..........................................................................*........................... + // sqrdmulh v20.4S, v17.4S, v0.S[1] // ........................................................................*............................. + // add v27.4S, v4.4S, v18.4S // ..........................*........................................................................... + // mul v18.4S, v29.4S, v0.S[0] // .........................................*............................................................ + // mls v13.4S, v15.4S, v8.S[0] // ...........................*.......................................................................... + // sqrdmulh v24.4S, v29.4S, v0.S[1] // ..............................................*....................................................... + // mls v12.4S, v20.4S, v8.S[0] // .................................................................................*.................... + // sub v4.4S, v22.4S, v23.4S // .......................................*.............................................................. + // sqrdmulh v5.4S, v14.4S, v26.4S // ........................................*............................................................. + // add v17.4S, v22.4S, v23.4S // .......................................................*.............................................. + // mul v20.4S, v4.4S, v0.S[0] // ............................................*......................................................... + // sub v28.4S, v9.4S, v13.4S // .............................................*........................................................ + // mls v18.4S, v24.4S, v8.S[0] // ..........................................................*........................................... 
+ // add v23.4S, v9.4S, v13.4S // .................................*.................................................................... + // sqrdmulh v6.4S, v4.4S, v0.S[1] // ..........................................*........................................................... + // mul v22.4S, v28.4S, v0.S[0] // ..................................................................*................................... + // sqrdmulh v29.4S, v17.4S, v26.4S // .....................................................................*................................ + // mul v13.4S, v14.4S, v25.4S // ......................................*............................................................... + // cmge v15.4S, v31.4S, v18.4S // .................................................................*.................................... + // cmge v21.4S, v18.4S, v30.4S // ....................................................................*................................. + // mls v20.4S, v6.4S, v8.S[0] // ..................................................*................................................... + // sqrdmulh v9.4S, v28.4S, v0.S[1] // ...................................................*.................................................. + // sqrdmulh v28.4S, v23.4S, v26.4S // .....................................*................................................................ + // sub v24.4S, v15.4S, v21.4S // .......................................................................*.............................. + // cmge v4.4S, v12.4S, v30.4S // ..........................................................................................*........... + // cmge v6.4S, v31.4S, v12.4S // .........................................................................................*............ + // cmge v21.4S, v31.4S, v20.4S // .........................................................*............................................ 
+ // cmge v10.4S, v20.4S, v30.4S // ...........................................................*.......................................... + // mls v22.4S, v9.4S, v8.S[0] // ...............................................................................*...................... + // mls v18.4S, v24.4S, v8.4S // ............................................................................*......................... + // sub v15.4S, v6.4S, v4.4S // ...............................................................................................*...... + // sub v21.4S, v21.4S, v10.4S // ................................................................*..................................... + // mul v4.4S, v27.4S, v25.4S // ...............................................................*...................................... + // sqrdmulh v14.4S, v27.4S, v26.4S // ................................*..................................................................... + // cmge v24.4S, v22.4S, v30.4S // ........................................................................................*............. + // mul v9.4S, v17.4S, v25.4S // .............................................................*........................................ + // cmge v6.4S, v31.4S, v22.4S // .......................................................................................*.............. + // mls v20.4S, v21.4S, v8.4S // .........................................................................*............................ + // mls v4.4S, v14.4S, v8.S[0] // ......................................................................*............................... + // mls v13.4S, v5.4S, v8.S[0] // ...............................................*...................................................... + // mls v12.4S, v15.4S, v8.4S // ..................................................................................................*... 
+ // mls v9.4S, v29.4S, v8.S[0] // .............................................................................*........................ + // str q20, [x0, #768] // ..................................................................................*................... + // mul v20.4S, v23.4S, v25.4S // ....................................*................................................................. + // sub v23.4S, v6.4S, v24.4S // .............................................................................................*........ + // cmge v29.4S, v31.4S, v13.4S // ....................................................*................................................. + // cmge v21.4S, v13.4S, v30.4S // ......................................................*............................................... + // cmge v15.4S, v31.4S, v4.4S // ..............................................................................*....................... + // cmge v5.4S, v4.4S, v30.4S // ................................................................................*..................... + // mls v20.4S, v28.4S, v8.S[0] // ...........................................*.......................................................... + // sub v29.4S, v29.4S, v21.4S // ............................................................*......................................... + // mls v22.4S, v23.4S, v8.4S // ................................................................................................*..... + // sub v21.4S, v15.4S, v5.4S // ......................................................................................*............... + // str q18, [x0, #512] // ....................................................................................*................. + // cmge v27.4S, v31.4S, v9.4S // ...................................................................................*.................. 
+ // cmge v28.4S, v9.4S, v30.4S // .....................................................................................*................ + // cmge v17.4S, v20.4S, v30.4S // .................................................*.................................................... + // cmge v6.4S, v31.4S, v20.4S // ................................................*..................................................... + // sub v15.4S, v27.4S, v28.4S // ...........................................................................................*.......... + // str q12, [x0, #640] // .....................................................................................................* + // mls v13.4S, v29.4S, v8.4S // ...................................................................*.................................. + // mls v4.4S, v21.4S, v8.4S // ............................................................................................*......... + // sub v5.4S, v6.4S, v17.4S // .....................................................*................................................ + // mls v9.4S, v15.4S, v8.4S // ..............................................................................................*....... + // str q22, [x0, #896] // ....................................................................................................*. + // mls v20.4S, v5.4S, v8.4S // ........................................................*............................................. + // str q13, [x0], #(16) // ...........................................................................*.......................... + // str q4, [x0, #112] // .................................................................................................*.... + // str q9, [x0, #240] // ...................................................................................................*.. 
+ // str q20, [x0, #368] // ..............................................................*....................................... + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_dilithium_123_456_78_twiddles.s b/examples/opt/aarch64/intt_dilithium_123_456_78_twiddles.s new file mode 100644 index 00000000..5c61d058 --- /dev/null +++ b/examples/opt/aarch64/intt_dilithium_123_456_78_twiddles.s @@ -0,0 +1,557 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+/// + +roots_l67: +.word -1744507 +.word 2236726 +.word 1922253 +.word 3818627 +.word -447030292 +.word 573161516 +.word 492577742 +.word 978523985 +.word 731434 +.word 781875 +.word 3773731 +.word -3531229 +.word 187430119 +.word 200355636 +.word 967019376 +.word -904878186 +.word -1054478 +.word -1900052 +.word 3974485 +.word 303005 +.word -270210213 +.word -486888731 +.word 1018462631 +.word 77645096 +.word 2354215 +.word -1011223 +.word 327848 +.word -348812 +.word 603268097 +.word -259126110 +.word 84011120 +.word -89383150 +.word 392707 +.word 1716814 +.word 2193087 +.word -3123762 +.word 100631253 +.word 439933955 +.word 561979013 +.word -800464680 +.word -2926054 +.word 3014420 +.word -2358373 +.word 2185084 +.word -749801963 +.word 772445769 +.word -604333585 +.word 559928242 +.word 459163 +.word 653275 +.word -2312838 +.word 3467665 +.word 117660617 +.word 167401858 +.word -592665232 +.word 888589898 +.word 1514152 +.word -3430436 +.word 553718 +.word 1103344 +.word 388001774 +.word -879049958 +.word 141890356 +.word 282732136 +.word -140244 +.word -860144 +.word -508145 +.word -3105558 +.word -35937555 +.word -220412084 +.word -130212265 +.word -795799901 +.word 2778788 +.word -2683270 +.word 2775755 +.word -1356448 +.word 712065019 +.word -687588511 +.word 711287812 +.word -347590090 +.word 770441 +.word -214880 +.word -3020393 +.word 11879 +.word 197425671 +.word -55063046 +.word -773976352 +.word 3043996 +.word -545376 +.word -3363542 +.word 1370517 +.word -3994671 +.word -139752717 +.word -861908357 +.word 351195274 +.word -1023635298 +.word -3374250 +.word -2925816 +.word 1226661 +.word -3901472 +.word -864652284 +.word -749740976 +.word 314332144 +.word -999753034 +.word 3369273 +.word -2028038 +.word -1723229 +.word -2569011 +.word 863376927 +.word -519685171 +.word -441577800 +.word -658309618 +.word -1163598 +.word -1665318 +.word 1615530 +.word -3980599 +.word -298172236 +.word -426738094 +.word 413979908 +.word -1020029345 +.word -621164 
+.word -3035980 +.word -2461387 +.word 1317678 +.word -159173408 +.word -777970524 +.word -630730945 +.word 337655269 +.word 4022750 +.word -4148469 +.word -3009748 +.word 338420 +.word 1030830548 +.word -1063046068 +.word -771248568 +.word 86720197 +.word -749577 +.word 2612853 +.word -2647994 +.word 3033742 +.word -192079267 +.word 669544140 +.word -678549029 +.word 777397036 +.word 2362063 +.word 1300016 +.word 4182915 +.word -3482206 +.word 605279149 +.word 333129378 +.word 1071872863 +.word -892316032 +.word 1834526 +.word 1187885 +.word 1393159 +.word -1994046 +.word 470097680 +.word 304395785 +.word 356997292 +.word -510974714 +.word 724804 +.word -507927 +.word -2491325 +.word 1476985 +.word 185731180 +.word -130156402 +.word -638402564 +.word 378477722 +.word 2254727 +.word 2391089 +.word -1787943 +.word 2579253 +.word 577774276 +.word 612717067 +.word -458160776 +.word 660934133 +.word 2743411 +.word 1179613 +.word 2033807 +.word -2105286 +.word 702999655 +.word 302276083 +.word 521163479 +.word -539479988 +.word -527981 +.word -586241 +.word 2374402 +.word 1623354 +.word -135295244 +.word -150224382 +.word 608441020 +.word 415984810 +.word -3258457 +.word 3250154 +.word -235407 +.word -1736313 +.word -834980303 +.word 832852657 +.word -60323094 +.word -444930577 +.word 2178965 +.word 1879878 +.word 3472069 +.word 1921994 +.word 558360247 +.word 481719139 +.word 889718424 +.word 492511373 +.word 818761 +.word -2039144 +.word -4040196 +.word 458740 +.word 209807681 +.word -522531086 +.word -1035301089 +.word 117552223 +.word 3197248 +.word -1987814 +.word 3488383 +.word 4166425 +.word 819295484 +.word -509377762 +.word 893898890 +.word 1067647297 +.word 2218467 +.word -613238 +.word -2513018 +.word -141835 +.word 568482643 +.word -157142369 +.word -643961400 +.word -36345249 +.word 1310261 +.word 1354892 +.word 89301 +.word -2998219 +.word 335754661 +.word 347191365 +.word 22883400 +.word -768294260 +.word 3334383 +.word -2462444 +.word -169688 +.word 
565603 +.word 854436357 +.word -631001801 +.word -43482586 +.word 144935890 +.word 12417 +.word -2642980 +.word 3838479 +.word -2296099 +.word 3181859 +.word -677264190 +.word 983611064 +.word -588375860 +.word -1254190 +.word -3195676 +.word -1239911 +.word -3747250 +.word -321386456 +.word -818892658 +.word -317727459 +.word -960233614 +.word 2962264 +.word -1148858 +.word -482649 +.word -1528066 +.word 759080783 +.word -294395108 +.word -123678909 +.word -391567239 +.word 3180456 +.word 3611750 +.word 1727088 +.word 1772588 +.word 814992530 +.word 925511710 +.word 442566669 +.word 454226054 +.word 268456 +.word -2387513 +.word -2192938 +.word 4146264 +.word 68791907 +.word -611800717 +.word -561940831 +.word 1062481036 +.word -4158088 +.word 1109516 +.word 2983781 +.word -2811291 +.word -1065510939 +.word 284313712 +.word 764594519 +.word -720393920 +.word 2455377 +.word -635956 +.word 3768948 +.word 3410568 +.word 629190881 +.word -162963861 +.word 965793731 +.word 873958779 +.word 250446 +.word 3551006 +.word -2678278 +.word 1685153 +.word 64176841 +.word 909946047 +.word -686309310 +.word 431820817 +.word 3815725 +.word -1937570 +.word -2028118 +.word -2508980 +.word 977780347 +.word -496502727 +.word -519705671 +.word -642926661 +.word 3759465 +.word -1596822 +.word 2454145 +.word -822541 +.word 963363710 +.word -409185979 +.word 628875181 +.word -210776307 +.word 3956944 +.word 1979497 +.word -1009365 +.word 27812 +.word 1013967746 +.word 507246529 +.word -258649997 +.word 7126831 +.word 274060 +.word 3121440 +.word 3222807 +.word -4183372 +.word 70227934 +.word 799869667 +.word 825844983 +.word -1071989969 +.word 3716946 +.word 2296397 +.word 3965306 +.word -87208 +.word 952468207 +.word 588452222 +.word 1016110510 +.word -22347069 +.word 3284915 +.word 3956745 +.word -636927 +.word -1182243 +.word 841760171 +.word 1013916752 +.word -163212680 +.word -302950022 +.word -3852015 +.word 2635473 +.word -1277625 +.word -3073009 +.word -987079667 +.word 
675340520 +.word -327391679 +.word -787459213 +.word -2772600 +.word 1780227 +.word 1455890 +.word 1935420 +.word -710479343 +.word 456183549 +.word 373072124 +.word 495951789 +.word 59148 +.word -2660408 +.word 2659525 +.word -1753 +.word 15156688 +.word -681730119 +.word 681503850 +.word -449207 +roots_l345: +.word 1221177 +.word 312926867 +.word -2283733 +.word -585207070 +.word -2815639 +.word -721508096 +.word -1858416 +.word -476219497 +.word -3345963 +.word -857403734 +.word -1853806 +.word -475038184 +.word -2917338 +.word -747568486 +.word 0 +.word 0 +.word -557458 +.word -142848732 +.word 3585098 +.word 918682129 +.word 642628 +.word 164673562 +.word -3870317 +.word -991769559 +.word -556856 +.word -142694469 +.word -3192354 +.word -818041395 +.word 2897314 +.word 742437332 +.word 0 +.word 0 +.word 1005239 +.word 257592709 +.word -1460718 +.word -374309300 +.word -2453983 +.word -628833668 +.word 3950053 +.word 1012201926 +.word 1716988 +.word 439978542 +.word 1935799 +.word 496048908 +.word -3756790 +.word -962678241 +.word 0 +.word 0 +.word -3764867 +.word -964747974 +.word -1714295 +.word -439288460 +.word 3227876 +.word 827143915 +.word 3574466 +.word 915957677 +.word 817536 +.word 209493775 +.word -1759347 +.word -450833045 +.word -3415069 +.word -875112161 +.word 0 +.word 0 +.word -2129892 +.word -545785280 +.word 1335936 +.word 342333886 +.word -676590 +.word -173376332 +.word -2156050 +.word -552488273 +.word -3241972 +.word -830756018 +.word 4018989 +.word 1029866791 +.word -2071829 +.word -530906624 +.word 0 +.word 0 +.word -2682288 +.word -687336873 +.word 434125 +.word 111244624 +.word 3524442 +.word 903139016 +.word 3506380 +.word 898510625 +.word -1095468 +.word -280713909 +.word -928749 +.word -237992130 +.word -394148 +.word -101000509 +.word 0 +.word 0 +.word -3542485 +.word -907762539 +.word 1674615 +.word 429120452 +.word -2663378 +.word -682491182 +.word -1159875 +.word -297218217 +.word -3704823 +.word -949361686 +.word -2101410 
+.word -538486762 +.word 3110818 +.word 797147778 +.word 0 +.word 0 +.word 601683 +.word 154181397 +.word 4063053 +.word 1041158200 +.word 3370349 +.word 863652652 +.word 3586446 +.word 919027554 +.word -2740543 +.word -702264730 +.word -3182878 +.word -815613168 +.word -3602218 +.word -923069133 +.word 0 +.word 0 +roots_l012: +.word 3572223 +.word 915382907 +.word -3761513 +.word -963888510 +.word -3765607 +.word -964937599 +.word 3201430 +.word 820367122 +.word 3145678 +.word 806080660 +.word 2883726 +.word 738955404 +.word 3201494 +.word 820383522 +.word 0 +.word 0 \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s new file mode 100644 index 00000000..3b1ae537 --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a55.s @@ -0,0 +1,1482 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+/// SOFTWARE.
+///
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset
+    ldr qform_\vec, [\base, #\offset]             // load the q-form alias of \vec from \base + \offset
+.endm
+
+.macro ldr_vi vec, base, inc
+    ldr qform_\vec, [\base], #\inc                // load from \base, then post-increment \base by \inc
+.endm
+
+.macro str_vo vec, base, offset
+    str qform_\vec, [\base, #\offset]             // store the q-form alias of \vec to \base + \offset
+.endm
+.macro str_vi vec, base, inc
+    str qform_\vec, [\base], #\inc                // store to \base, then post-increment \base by \inc
+.endm
+
+.macro vqrdmulh d,a,b
+    sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmlsq d,a,b,i
+    mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqrdmulhq d,a,b,i
+    sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i
+    sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i
+    mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1
+    vmulq \dst, \src, \const, \idx0               // dst = src * const lane idx0
+    vqrdmulhq \src, \src, \const, \idx1           // overwrites src with sqrdmulh(src, const lane idx1)
+    vmlsq \dst, \src, consts, 0                   // dst -= src * lane 0 of consts
+.endm
+
+.macro mulmod dst, src, const, const_twisted
+    mul \dst\().8h, \src\().8h, \const\().8h      // vector-by-vector form of mulmodq
+    vqrdmulh \src, \src, \const_twisted           // overwrites src
+    vmlsq \dst, \src, consts, 0
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1
+    sub tmp.8h, \a\().8h, \b\().8h                // tmp = a - b
+    add \a\().8h, \a\().8h, \b\().8h              // a = a + b
+    mulmodq \b, tmp, \root, \idx0, \idx1          // b = mulmodq of tmp with the given root lanes
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted
+    mul \dst\().8h, \src\().8h, \const\().8h      // same instruction sequence as mulmod
+    vqrdmulh \src, \src, \const_twisted
+    vmlsq \dst, \src, consts, 0
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted
+    sub tmp.8h, \a\().8h, \b\().8h                // tmp = a - b
+    add \a\().8h, \a\().8h, \b\().8h              // a = a + b
+    mulmod \b, tmp, \root, \root_twisted          // expands mulmod (not mulmod_v; the two bodies are identical)
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+    mulmod \dst0, \src0, ninv, ninv_tw
+    mulmod \dst1, \src1, ninv, ninv_tw
+    mulmod \dst2, \src2, ninv, ninv_tw
+    mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro barrett_reduce a
+    vqdmulhq t0, \a, consts, 1                    // t0 = sqdmulh(a, lane 1 of consts)
+    srshr t0.8h, t0.8h, #11                       // rounding shift right by 11
+    vmlsq \a, t0, consts, 0                       // a -= t0 * lane 0 of consts
+.endm
+
+.macro load_roots_123
+    ldr_vi root0, r_ptr0, 32                      // r_ptr0 advances by 32 here ...
+    ldr_vo root1, r_ptr0, -16                     // ... so -16 addresses the second 16-byte vector
+.endm
+
+.macro load_next_roots_45
+    ldr_vi root0, r_ptr0, 16
+.endm
+
+.macro load_next_roots_67
+    ldr_vi root0, r_ptr1, (6*16)                  // r_ptr1 advances past six 16-byte vectors; offsets below are relative to the new value
+    ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+    ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+    ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+    ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+    ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+.macro transpose4 data
+    trn1 t0.4s, \data\()0.4s, \data\()1.4s        // first stage: interleave 32-bit lanes into t0-t3
+    trn2 t1.4s, \data\()0.4s, \data\()1.4s
+    trn1 t2.4s, \data\()2.4s, \data\()3.4s
+    trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+    trn2 \data\()2.2d, t0.2d, t2.2d               // second stage: interleave 64-bit lanes back into data0-data3
+    trn2 \data\()3.2d, t1.2d, t3.2d
+    trn1 \data\()0.2d, t0.2d, t2.2d
+    trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+.macro transpose_single data_out, data_in
+    trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s   // only the 32-bit stage of transpose4
+    trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+    trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+    trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+// Reserve 6*16 bytes of stack and spill x19-x29 there; restore_gprs undoes this.
+.macro save_gprs // slothy:no-unfold
+    sub sp, sp, #(16*6)
+    stp x19, x20, [sp, #16*0]
+    stp x21, x22, [sp, #16*1]
+    stp x23, x24, [sp, #16*2]
+    stp x25, x26, [sp, #16*3]
+    stp x27, x28, [sp, #16*4]
+    str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+    ldp x19, x20, [sp, #16*0]
+    ldp x21, x22, [sp, #16*1]
+    ldp x23, x24, [sp, #16*2]
+    ldp x25, x26, [sp, #16*3]
+    ldp x27, x28, [sp, #16*4]
+    ldr x29, [sp, #16*5]
+    add sp, sp, #(16*6)                           // release the area reserved by save_gprs
+.endm
+
+.macro save_vregs // slothy:no-unfold
+    sub sp, sp, #(16*4)                           // reserve 64 bytes for d8-d15
+    stp d8, d9, [sp, #16*0]
+    stp d10, d11, [sp, #16*1]
+    stp d12, d13, [sp, #16*2]
+    stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+    ldp d8, d9, [sp, #16*0]
+    ldp d10, d11, [sp, #16*1]
+    ldp d12, d13, [sp, #16*2]
+    ldp d14, d15, [sp, #16*3]
+    add sp, sp, #(16*4)                           // release the area reserved by save_vregs
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+    ldr \a, [sp, #\loc\()]                        // reload \a from stack offset \loc
+.endm
+.macro save loc, a // slothy:no-unfold
+    str \a, [sp, #\loc\()]                        // note the argument order: offset first, register second
+.endm
+.macro push_stack // slothy:no-unfold
+    save_gprs
+    save_vregs
+    sub sp, sp, #STACK_SIZE                       // extra STACK_SIZE bytes addressed via save/restore
+.endm
+
+.macro pop_stack // slothy:no-unfold
+    add sp, sp, #STACK_SIZE                       // mirror of push_stack, in reverse order
+    restore_vregs
+    restore_gprs
+.endm
+
+.data
+.p2align 4
+roots:
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+    .global intt_kyber_123_4567_manual_ld4_opt_a55
+    .global _intt_kyber_123_4567_manual_ld4_opt_a55
+
+.p2align 4
+const_addr: .short 3329                           // lane 0: multiplier used by every "vmlsq ..., consts, 0"
+    .short 20159                                  // lane 1: multiplier used by barrett_reduce
+    .short 0
+    .short 0
+    .short 0
+    .short 0
+    .short 0
+    .short 0
+ninv_addr: .short 512
+    .short 512
+    .short 512
+    .short 512
+    .short 512
+    .short 512
+    .short 512
+    .short 512
+ninv_tw_addr: .short 5040
+    .short 5040
+    .short 5040
+    .short 5040
+    .short 5040
+    .short 5040
+    .short 5040
+    .short 5040
+
+intt_kyber_123_4567_manual_ld4_opt_a55:
+_intt_kyber_123_4567_manual_ld4_opt_a55:
+    push_stack
+
+    in .req x0
+    inp .req x1
+    count .req x2
+    r_ptr0 .req x3
+    r_ptr1 .req x4
+    xtmp .req x5
+
+    qform_v0 .req q0
+    qform_v1 .req q1
+    qform_v2 .req q2
+    qform_v3 .req q3
+    qform_v4 .req q4
+    qform_v5 .req q5
+    qform_v6 .req q6
+    qform_v7 .req q7
+    qform_v8 .req q8
+    qform_v9 .req q9
+    qform_v10 .req q10
+    qform_v11 .req q11
+    qform_v12 .req q12
+    qform_v13 .req q13
+    qform_v14 .req q14
+    qform_v15 .req q15
+    qform_v16 .req q16
+    qform_v17 .req q17
+    qform_v18 .req q18
+
qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*........................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + ldr q22, [x4, #64] // ..*....................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v19.8H, v5.8H, v6.8H // ...*...................................................... + // gap // .......................................................... + ldr q23, [x4, #80] // ...........*.............................................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mul v29.8H, v19.8H, v22.8H // ....*..................................................... + // gap // .......................................................... + ldr q26, [x4, #48] // .....*.................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ sub v0.8H, v3.8H, v4.8H // .......*.................................................. + // gap // .......................................................... + ldr q28, [x4, #32] // .........*................................................ + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v19.8H, v19.8H, v23.8H // .............*............................................ + // gap // .......................................................... + sqrdmulh v22.8H, v0.8H, v26.8H // ..........*............................................... + // gap // .......................................................... + mul v24.8H, v0.8H, v28.8H // ............*............................................. + // gap // .......................................................... + add v27.8H, v5.8H, v6.8H // ...............*.......................................... + // gap // .......................................................... + mls v29.8H, v19.8H, v7.H[0] // ................*......................................... + // gap // .......................................................... + add v3.8H, v3.8H, v4.8H // ........*................................................. + // gap // .......................................................... + mls v24.8H, v22.8H, v7.H[0] // ..............*........................................... + // gap // .......................................................... + ldr q23, [x4, #16] // *......................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sub v28.8H, v3.8H, v27.8H // .................*........................................ 
+ // gap // .......................................................... + sub v22.8H, v24.8H, v29.8H // ...................*...................................... + // gap // .......................................................... + ldr q0, [x4], #(6*16) // ......*................................................... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + sqrdmulh v19.8H, v22.8H, v23.8H // .......................*.................................. + // gap // .......................................................... + sqrdmulh v23.8H, v28.8H, v23.8H // .....................*.................................... + // gap // .......................................................... + mul v22.8H, v22.8H, v0.8H // ......................*................................... + // gap // .......................................................... + mul v0.8H, v28.8H, v0.8H // ....................*..................................... + // gap // .......................................................... + add v28.8H, v24.8H, v29.8H // ........................*................................. + // gap // .......................................................... + add v27.8H, v3.8H, v27.8H // ..................*....................................... + // gap // .......................................................... + mls v22.8H, v19.8H, v7.H[0] // ..........................*............................... + // gap // .......................................................... + mls v0.8H, v23.8H, v7.H[0] // .........................*................................ + // gap // .......................................................... + trn1 v23.4S, v27.4S, v28.4S // ..............................*........................... + // gap // .......................................................... 
+ ldr q11, [x3], #16 // ............................*............................. + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + trn1 v19.4S, v0.4S, v22.4S // ...............................*.......................... + // gap // .......................................................... + trn2 v22.4S, v0.4S, v22.4S // .............................*............................ + // gap // .......................................................... + trn2 v28.4S, v27.4S, v28.4S // ...........................*.............................. + // gap // .......................................................... + trn1 v0.2D, v23.2D, v19.2D // ...................................*...................... + // gap // .......................................................... + trn2 v19.2D, v23.2D, v19.2D // ..................................*....................... + // gap // .......................................................... + trn1 v3.2D, v28.2D, v22.2D // .................................*........................ + // gap // .......................................................... + trn2 v23.2D, v28.2D, v22.2D // ................................*......................... + // gap // .......................................................... + add v27.8H, v0.8H, v3.8H // .....................................*.................... + // gap // .......................................................... + add v24.8H, v19.8H, v23.8H // ....................................*..................... + // gap // .......................................................... + sub v23.8H, v19.8H, v23.8H // ...........................................*.............. + // gap // .......................................................... 
+ sqdmulh v22.8H, v27.8H, v7.H[1] // .......................................*.................. + // gap // .......................................................... + sqdmulh v28.8H, v24.8H, v7.H[1] // ......................................*................... + // gap // .......................................................... + sqrdmulh v19.8H, v23.8H, v11.H[5] // ..................................................*....... + // gap // .......................................................... + mul v8.8H, v23.8H, v11.H[4] // ................................................*......... + // gap // .......................................................... + srshr v23.8H, v22.8H, #11 // ..........................................*............... + // gap // .......................................................... + srshr v22.8H, v28.8H, #11 // .........................................*................ + // gap // .......................................................... + sub v28.8H, v0.8H, v3.8H // ........................................*................. + // gap // .......................................................... + mls v27.8H, v23.8H, v7.H[0] // .............................................*............ + // gap // .......................................................... + mls v24.8H, v22.8H, v7.H[0] // ............................................*............. + // gap // .......................................................... + mls v8.8H, v19.8H, v7.H[0] // .......................................................*.. + // gap // .......................................................... + sqrdmulh v0.8H, v28.8H, v11.H[3] // ..............................................*........... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... 
+ sub v22.8H, v27.8H, v24.8H // .................................................*........ + // gap // .......................................................... + add v23.8H, v27.8H, v24.8H // ...................................................*...... + // gap // .......................................................... + mul v28.8H, v28.8H, v11.H[2] // ...............................................*.......... + // gap // .......................................................... + sqrdmulh v19.8H, v22.8H, v11.H[1] // .....................................................*.... + // gap // .......................................................... + mul v27.8H, v22.8H, v11.H[0] // ....................................................*..... + // gap // .......................................................... + str q23, [x1], #(64) // ........................................................*. + // gap // .......................................................... + mls v28.8H, v0.8H, v7.H[0] // ......................................................*... + // gap // .......................................................... + // gap // .......................................................... + // gap // .......................................................... + mls v27.8H, v19.8H, v7.H[0] // .........................................................* + // gap // .......................................................... + + // original source code + // ldr q22, [x4, #16] // ...............*.......................................... + // ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1] // *......................................................... + // ldr q24, [x4, #64] // .*........................................................ + // sub v29.8H, v15.8H, v16.8H // ..*....................................................... + // mul v18.8H, v29.8H, v24.8H // ....*..................................................... 
+ // ldr q27, [x4, #48] // .....*.................................................... + // ldr q19, [x4], #(6*16) // ..................*....................................... + // sub v26.8H, v13.8H, v14.8H // ......*................................................... + // add v0.8H, v13.8H, v14.8H // .............*............................................ + // ldr q23, [x4, #-64] // .......*.................................................. + // sqrdmulh v27.8H, v26.8H, v27.8H // .........*................................................ + // ldr q20, [x4, #-16] // ...*...................................................... + // mul v23.8H, v26.8H, v23.8H // ..........*............................................... + // sqrdmulh v3.8H, v29.8H, v20.8H // ........*................................................. + // mls v23.8H, v27.8H, v7.H[0] // ..............*........................................... + // add v27.8H, v15.8H, v16.8H // ...........*.............................................. + // mls v18.8H, v3.8H, v7.H[0] // ............*............................................. + // sub v29.8H, v0.8H, v27.8H // ................*......................................... + // add v28.8H, v0.8H, v27.8H // ........................*................................. + // sub v3.8H, v23.8H, v18.8H // .................*........................................ + // mul v20.8H, v29.8H, v19.8H // ......................*................................... + // sqrdmulh v29.8H, v29.8H, v22.8H // ....................*..................................... + // mul v19.8H, v3.8H, v19.8H // .....................*.................................... + // sqrdmulh v22.8H, v3.8H, v22.8H // ...................*...................................... + // add v9.8H, v23.8H, v18.8H // .......................*.................................. + // mls v20.8H, v29.8H, v7.H[0] // ..........................*............................... 
+ // mls v19.8H, v22.8H, v7.H[0] // .........................*................................ + // trn2 v31.4S, v28.4S, v9.4S // ...............................*.......................... + // ldr q11, [x3], #16 // ............................*............................. + // trn2 v21.4S, v20.4S, v19.4S // ..............................*........................... + // trn1 v13.4S, v28.4S, v9.4S // ...........................*.............................. + // trn1 v28.4S, v20.4S, v19.4S // .............................*............................ + // trn2 v27.2D, v31.2D, v21.2D // ...................................*...................... + // trn1 v19.2D, v31.2D, v21.2D // ..................................*....................... + // trn2 v0.2D, v13.2D, v28.2D // .................................*........................ + // trn1 v22.2D, v13.2D, v28.2D // ................................*......................... + // add v9.8H, v0.8H, v27.8H // .....................................*.................... + // add v24.8H, v22.8H, v19.8H // ....................................*..................... + // sqdmulh v23.8H, v9.8H, v7.H[1] // ........................................*................. + // sqdmulh v13.8H, v24.8H, v7.H[1] // .......................................*.................. + // sub v31.8H, v22.8H, v19.8H // .............................................*............ + // srshr v23.8H, v23.8H, #11 // ............................................*............. + // srshr v1.8H, v13.8H, #11 // ...........................................*.............. + // sub v21.8H, v0.8H, v27.8H // ......................................*................... + // mls v9.8H, v23.8H, v7.H[0] // ...............................................*.......... + // mls v24.8H, v1.8H, v7.H[0] // ..............................................*........... + // sqrdmulh v19.8H, v31.8H, v11.H[3] // .................................................*........ 
+ // mul v28.8H, v31.8H, v11.H[2] // ....................................................*..... + // mul v8.8H, v21.8H, v11.H[4] // ..........................................*............... + // sub v22.8H, v24.8H, v9.8H // ..................................................*....... + // sqrdmulh v23.8H, v21.8H, v11.H[5] // .........................................*................ + // add v12.8H, v24.8H, v9.8H // ...................................................*...... + // mul v27.8H, v22.8H, v11.H[0] // ......................................................*... + // sqrdmulh v13.8H, v22.8H, v11.H[1] // .....................................................*.... + // mls v28.8H, v19.8H, v7.H[0] // ........................................................*. + // mls v8.8H, v23.8H, v7.H[0] // ................................................*......... + // str q12, [x1], #(64) // .......................................................*.. + // mls v27.8H, v13.8H, v7.H[0] // .........................................................* + + sub count, count, #1 +layer4567_start: + ldr q22, [x4, #16] // ..e..................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ld4 {v13.4S, v14.4S, v15.4S, v16.4S}, [x1] // e....................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q24, [x4, #64] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v29.8H, v15.8H, v16.8H // ............e........................................................... + // gap // ........................................................................ + sqdmulh v23.8H, v28.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + str q27, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ 
+ mul v18.8H, v29.8H, v24.8H // ..............e......................................................... + // gap // ........................................................................ + ldr q27, [x4, #48] // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q19, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v23.8H, v23.8H, #11 // ..................................................*..................... + // gap // ........................................................................ + sqdmulh v24.8H, v8.8H, v7.H[1] // .......................................................*................ + // gap // ........................................................................ + sub v26.8H, v13.8H, v14.8H // .......e................................................................ + // gap // ........................................................................ + add v0.8H, v13.8H, v14.8H // ........e............................................................... + // gap // ........................................................................ + mls v28.8H, v23.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + srshr v24.8H, v24.8H, #11 // ........................................................*............... 
+ // gap // ........................................................................ + ldr q23, [x4, #-64] // ...e.................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v8.8H, v24.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + sqrdmulh v27.8H, v26.8H, v27.8H // ..........e............................................................. + // gap // ........................................................................ + ldr q20, [x4, #-16] // ......e................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v30.8H, v28.8H, v8.8H // ...............................................................*........ + // gap // ........................................................................ + mul v23.8H, v26.8H, v23.8H // .........e.............................................................. + // gap // ........................................................................ + sqrdmulh v3.8H, v29.8H, v20.8H // ...............e........................................................ + // gap // ........................................................................ + add v28.8H, v28.8H, v8.8H // ................................................................*....... + // gap // ........................................................................ 
+ sqrdmulh v26.8H, v30.8H, v11.H[1] // ..................................................................*..... + // gap // ........................................................................ + mls v23.8H, v27.8H, v7.H[0] // ...........e............................................................ + // gap // ........................................................................ + add v27.8H, v15.8H, v16.8H // .............e.......................................................... + // gap // ........................................................................ + mls v18.8H, v3.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + str q28, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + sub v29.8H, v0.8H, v27.8H // .................e...................................................... + // gap // ........................................................................ + add v28.8H, v0.8H, v27.8H // ..................e..................................................... + // gap // ........................................................................ + sub v3.8H, v23.8H, v18.8H // ......................e................................................. + // gap // ........................................................................ + mul v20.8H, v29.8H, v19.8H // ...................e.................................................... + // gap // ........................................................................ + sqrdmulh v29.8H, v29.8H, v22.8H // ....................e................................................... + // gap // ........................................................................ 
+ mul v19.8H, v3.8H, v19.8H // ........................e............................................... + // gap // ........................................................................ + sqrdmulh v22.8H, v3.8H, v22.8H // .........................e.............................................. + // gap // ........................................................................ + add v9.8H, v23.8H, v18.8H // .......................e................................................ + // gap // ........................................................................ + mls v20.8H, v29.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + mul v3.8H, v30.8H, v11.H[0] // .................................................................*...... + // gap // ........................................................................ + mls v19.8H, v22.8H, v7.H[0] // ..........................e............................................. + // gap // ........................................................................ + trn2 v31.4S, v28.4S, v9.4S // ............................e........................................... + // gap // ........................................................................ + ldr q11, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v21.4S, v20.4S, v19.4S // ..............................e......................................... + // gap // ........................................................................ + trn1 v13.4S, v28.4S, v9.4S // ...........................e............................................ 
+ // gap // ........................................................................ + trn1 v28.4S, v20.4S, v19.4S // .............................e.......................................... + // gap // ........................................................................ + trn2 v27.2D, v31.2D, v21.2D // ................................e....................................... + // gap // ........................................................................ + trn1 v19.2D, v31.2D, v21.2D // ..................................e..................................... + // gap // ........................................................................ + trn2 v0.2D, v13.2D, v28.2D // ...............................e........................................ + // gap // ........................................................................ + trn1 v22.2D, v13.2D, v28.2D // .................................e...................................... + // gap // ........................................................................ + add v9.8H, v0.8H, v27.8H // ..........................................e............................. + // gap // ........................................................................ + add v24.8H, v22.8H, v19.8H // .....................................e.................................. + // gap // ........................................................................ + mls v3.8H, v26.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + sqdmulh v23.8H, v9.8H, v7.H[1] // ....................................................e................... + // gap // ........................................................................ + sqdmulh v13.8H, v24.8H, v7.H[1] // ..............................................e......................... + // gap // ........................................................................ 
+ sub v31.8H, v22.8H, v19.8H // ....................................e................................... + // gap // ........................................................................ + str q3, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + srshr v23.8H, v23.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + srshr v1.8H, v13.8H, #11 // ...............................................e........................ + // gap // ........................................................................ + sub v21.8H, v0.8H, v27.8H // .........................................e.............................. + // gap // ........................................................................ + mls v9.8H, v23.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + mls v24.8H, v1.8H, v7.H[0] // ................................................e....................... + // gap // ........................................................................ + sqrdmulh v19.8H, v31.8H, v11.H[3] // .......................................e................................ + // gap // ........................................................................ + mul v28.8H, v31.8H, v11.H[2] // ......................................e................................. + // gap // ........................................................................ + mul v8.8H, v21.8H, v11.H[4] // ...........................................e............................ + // gap // ........................................................................ 
+ sub v22.8H, v24.8H, v9.8H // ..........................................................e............. + // gap // ........................................................................ + sqrdmulh v23.8H, v21.8H, v11.H[5] // ............................................e........................... + // gap // ........................................................................ + add v12.8H, v24.8H, v9.8H // ...........................................................e............ + // gap // ........................................................................ + mul v27.8H, v22.8H, v11.H[0] // ............................................................e........... + // gap // ........................................................................ + sqrdmulh v13.8H, v22.8H, v11.H[1] // .............................................................e.......... + // gap // ........................................................................ + mls v28.8H, v19.8H, v7.H[0] // ........................................e............................... + // gap // ........................................................................ + mls v8.8H, v23.8H, v7.H[0] // .............................................e.......................... + // gap // ........................................................................ + str q12, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + mls v27.8H, v13.8H, v7.H[0] // ..............................................................e......... + // gap // ........................................................................ + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // .e......................................................................|e..................................................... 
+ // ldr q0, [x4], #(6*16) // ........e...............................................................|.......e.............................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // e.......................................................................e...................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ...............e........................................................|..............e....................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .......e................................................................|......e............................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ..e.....................................................................|.e.................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ..................e.....................................................|.................e.................................... + // sub v24.8h, v8.8h, v9.8h // ...........e............................................................|..........e........................................... + // add v8.8h, v8.8h, v9.8h // ............e...........................................................|...........e.......................................... + // mul v9.8h, v24.8h, v1.8h // ....................e...................................................|...................e.................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // .................e......................................................|................e..................................... + // mls v9.8h, v24.8h, v7.h[0] // ........................e...............................................|.......................e.............................. + // sub v24.8h, v10.8h, v11.8h // ...e....................................................................|..e................................................... 
+ // add v10.8h, v10.8h, v11.8h // .........................e..............................................|........................e............................. + // mul v11.8h, v24.8h, v2.8h // ......e.................................................................|.....e................................................ + // sqrdmulh v24.8h, v24.8h, v6.8h // .....................e..................................................|....................e................................. + // mls v11.8h, v24.8h, v7.h[0] // ..........................e.............................................|.........................e............................ + // sub v24.8h, v8.8h, v10.8h // ............................e...........................................|...........................e.......................... + // add v8.8h, v8.8h, v10.8h // .............................e..........................................|............................e......................... + // mul v10.8h, v24.8h, v0.8h // ...............................e........................................|..............................e....................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ................................e.......................................|...............................e...................... + // mls v10.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e.................. + // sub v24.8h, v9.8h, v11.8h // ..............................e.........................................|.............................e........................ + // add v9.8h, v9.8h, v11.8h // ...................................e....................................|..................................e................... + // mul v11.8h, v24.8h, v0.8h // .................................e......................................|................................e..................... 
+ // sqrdmulh v24.8h, v24.8h, v4.8h // ..................................e.....................................|.................................e.................... + // mls v11.8h, v24.8h, v7.h[0] // ......................................e.................................|.....................................e................ + // trn1 v25.4s, v8.4s, v9.4s // ..........................................e.............................|.........................................e............ + // trn2 v26.4s, v8.4s, v9.4s // .......................................e................................|......................................e............... + // trn1 v27.4s, v10.4s, v11.4s // ...........................................e............................|..........................................e........... + // trn2 v28.4s, v10.4s, v11.4s // .........................................e..............................|........................................e............. + // trn2 v10.2d, v25.2d, v27.2d // ..............................................e.........................|.............................................e........ + // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e.......... + // trn1 v8.2d, v25.2d, v27.2d // ...............................................e........................|..............................................e....... + // trn1 v9.2d, v26.2d, v28.2d // .............................................e..........................|............................................e......... + // ldr q0, [x3], #16 // ........................................e...............................|.......................................e.............. + // sub v24.8h, v8.8h, v9.8h // .....................................................e..................|....................................................e. 
+ // add v8.8h, v8.8h, v9.8h // .................................................e......................|................................................e..... + // mul v9.8h, v24.8h, v0.h[2] // .............................................................e..........|...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................................................e...........|...................................................... + // mls v9.8h, v24.8h, v7.h[0] // ....................................................................e...|...................................................... + // sub v24.8h, v10.8h, v11.8h // .........................................................e..............|...................................................... + // add v10.8h, v10.8h, v11.8h // ................................................e.......................|...............................................e...... + // mul v11.8h, v24.8h, v0.h[4] // ..............................................................e.........|...................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................................e.......|...................................................... + // mls v11.8h, v24.8h, v7.h[0] // .....................................................................e..|...................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ....................................................e...................|...................................................e.. + // srshr v25.8h, v25.8h, #11 // ........................................................e...............|...................................................... + // mls v8.8h, v25.8h, v7.h[0] // ...........................................................e............|...................................................... 
+ // sqdmulh v25.8h, v9.8h, v7.h[1] // ....*...................................................................|...*.................................................. + // srshr v25.8h, v25.8h, #11 // .........*..............................................................|........*............................................. + // mls v9.8h, v25.8h, v7.h[0] // .............*..........................................................|............*......................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ...................................................e....................|..................................................e... + // srshr v25.8h, v25.8h, #11 // .......................................................e................|...................................................... + // mls v10.8h, v25.8h, v7.h[0] // ..........................................................e.............|...................................................... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ..........*.............................................................|.........*............................................ + // srshr v25.8h, v25.8h, #11 // ..............*.........................................................|.............*........................................ + // mls v11.8h, v25.8h, v7.h[0] // ................*.......................................................|...............*...................................... + // sub v24.8h, v8.8h, v10.8h // ...............................................................e........|...................................................... + // add v8.8h, v8.8h, v10.8h // .................................................................e......|...................................................... + // mul v10.8h, v24.8h, v0.h[0] // ..................................................................e.....|...................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................................e....|...................................................... + // mls v10.8h, v24.8h, v7.h[0] // .......................................................................e|...................................................... + // sub v24.8h, v9.8h, v11.8h // ...................*....................................................|..................*................................... + // add v9.8h, v9.8h, v11.8h // ......................*.................................................|.....................*................................ + // mul v11.8h, v24.8h, v0.h[0] // .....................................*..................................|....................................*................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................*................................................|......................*............................... + // mls v11.8h, v24.8h, v7.h[0] // ..................................................*.....................|.................................................*.... + // str q8, [x1], #(64) // ......................................................................e.|...................................................... + // str q9, [x1, #(-64 + 16*1)] // ...........................*............................................|..........................*........................... + // str q10, [x1, #(-64 + 16*2)] // .....*..................................................................|....*................................................. + // str q11, [x1, #(-64 + 16*3)] // ......................................................*.................|.....................................................* + + sub count, count, #1 + cbnz count, layer4567_start + sqdmulh v19.8H, v28.8H, v7.H[1] // *............. + // gap // .............. + sqdmulh v23.8H, v8.8H, v7.H[1] // ...*.......... 
+ // gap // .............. + // NOTE: v27 is not written by the straight-line code between the cbnz and this store, so the value stored here is the one left in v27 by the loop. + str q27, [x1, #-32] // .*............ + // gap // .............. + // gap // .............. + // gap // .............. + // The two sqdmulh results (v19 from v28, v23 from v8) are rounding-shifted right by 11 and then multiply-subtracted by v7.H[0] out of v28 and v8. NOTE(review): looks like a Barrett-style reduction with the modulus in v7.H[0]; confirm what v7 holds. + srshr v19.8H, v19.8H, #11 // ..*........... + // gap // .............. + srshr v23.8H, v23.8H, #11 // .....*........ + // gap // .............. + // gap // .............. + // gap // .............. + mls v28.8H, v19.8H, v7.H[0] // ....*......... + // gap // .............. + mls v8.8H, v23.8H, v7.H[0] // ......*....... + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // Difference and sum of v28 and v8: the sum (v23) is stored at [x1, #-48]; the difference (v19) is multiplied by v11.H[0] (mul) and v11.H[1] (sqrdmulh), combined by mls with v7.H[0], and stored at [x1, #-16]. sqrdmulh reads v19 before mul overwrites it. + sub v19.8H, v28.8H, v8.8H // .......*...... + // gap // .............. + add v23.8H, v28.8H, v8.8H // ........*..... + // gap // .............. + // gap // .............. + // gap // .............. + sqrdmulh v22.8H, v19.8H, v11.H[1] // .........*.... + // gap // .............. + mul v19.8H, v19.8H, v11.H[0] // ...........*.. + // gap // .............. + str q23, [x1, #-48] // ..........*... + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + mls v19.8H, v22.8H, v7.H[0] // ............*. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + // gap // .............. + str q19, [x1, #-16] // .............* + // gap // .............. + + // The listing below shows the same instructions in their pre-scheduling order and with their pre-renaming registers; the marker after each entry gives that instruction's position in the scheduled sequence above. + // original source code + // sqdmulh v23.8H, v28.8H, v7.H[1] // *............. + // str q27, [x1, #-32] // ..*........... + // srshr v23.8H, v23.8H, #11 // ...*.......... + // sqdmulh v24.8H, v8.8H, v7.H[1] // .*............ + // mls v28.8H, v23.8H, v7.H[0] // .....*........ + // srshr v24.8H, v24.8H, #11 // ....*......... + // mls v8.8H, v24.8H, v7.H[0] // ......*....... + // sub v30.8H, v28.8H, v8.8H // .......*...... + // add v28.8H, v28.8H, v8.8H // ........*.....
+ // sqrdmulh v26.8H, v30.8H, v11.H[1] // .........*.... + // str q28, [x1, #-48] // ...........*.. + // mul v3.8H, v30.8H, v11.H[0] // ..........*... + // mls v3.8H, v26.8H, v7.H[0] // ............*. + // str q3, [x1, #-16] // .............* + + + // --------------------------------------------------------------------- + + // Register aliases for the two broadcast constants loaded below: ninv is v29 and ninv_tw is v30. The scheduled layer123 loop names these registers directly as v29 and v30. + ninv .req v29 + ninv_tw .req v30 + + // ld1r reads one 16-bit element at [xtmp] and replicates it into all eight .8h lanes. ASM_LOAD is a macro defined outside this chunk; presumably it places the address of the named symbol in xtmp (TODO confirm). + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + // count starts at 4 and is decremented once more right after the pre-loop code, before the layer123_start label. NOTE(review): presumably the code before and after the pipelined loop together covers one iteration; confirm. + mov count, #4 + // load_roots_123 is a macro defined outside this chunk; presumably it fills the twiddle registers through r_ptr0 (the loop reads lanes of v0 and v1); verify against its definition. + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + // Align the following code to a 4-byte boundary. + .p2align 2 + + // Executed once before the layer123_start label: eight q-register loads from x0 at offsets 0 to 448 in steps of 64, interleaved with four adds on the loaded values. The marker after each instruction gives its position in the original-order listing further down. + ldr q3, [x0, #256] // *........... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q11, [x0, #448] // .*.......... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q20, [x0, #384] // ..*......... + // gap // ............ + // gap // ............ + // gap // ............ + ldr q26, [x0, #320] // ...*........ + // gap // ............ + // gap // ............ + // gap // ............ + add v5.8H, v20.8H, v11.8H // .........*.. + // gap // ............ + ldr q28, [x0, #192] // .....*...... + // gap // ............ + // gap // ............ + // gap // ............ + add v25.8H, v3.8H, v26.8H // ........*... + // gap // ............ + ldr q22, [x0, #128] // ......*..... + // gap // ............ + // gap // ............ + // gap // ............ + add v14.8H, v25.8H, v5.8H // ...........* + // gap // ............ + ldr q23, [x0, #64] // ....*....... + // gap // ............ + // gap // ............ + // gap // ............ + add v24.8H, v22.8H, v28.8H // .......*.... + // gap // ............ + ldr q27, [x0, #0] // ..........*. + // gap // ............ + + // original source code + // ldr q3, [x0, #256] // *........... + // ldr q11, [x0, #448] // .*.......... + // ldr q20, [x0, #384] // ..*......... + // ldr q26, [x0, #320] // ...*........ + // ldr q23, [x0, #64] // .........*.. + // ldr q28, [x0, #192] // .....*...... + // ldr q22, [x0, #128] // .......*....
+ // add v24.8H, v22.8H, v28.8H // ..........*. + // add v25.8H, v3.8H, v26.8H // ......*..... + // add v5.8H, v20.8H, v11.8H // ....*....... + // ldr q27, [x0, #0] // ...........* + // add v14.8H, v25.8H, v5.8H // ........*... + + sub count, count, #1 +layer123_start: + sub v19.8H, v27.8H, v23.8H // ........*............................................................................... + // gap // ........................................................................................ + add v23.8H, v27.8H, v23.8H // .........*.............................................................................. + // gap // ........................................................................................ + sub v22.8H, v22.8H, v28.8H // .............*.......................................................................... + // gap // ........................................................................................ + mul v28.8H, v19.8H, v0.H[6] // ..........*............................................................................. + // gap // ........................................................................................ + sqrdmulh v19.8H, v19.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + sub v27.8H, v23.8H, v24.8H // ............................*........................................................... + // gap // ........................................................................................ + add v23.8H, v23.8H, v24.8H // .............................*.......................................................... + // gap // ........................................................................................ + mul v24.8H, v22.8H, v1.H[0] // ...............*........................................................................ 
+ // gap // ........................................................................................ + sqrdmulh v22.8H, v22.8H, v1.H[1] // ................*....................................................................... + // gap // ........................................................................................ + mls v28.8H, v19.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + sub v19.8H, v3.8H, v26.8H // ..................*..................................................................... + // gap // ........................................................................................ + mul v3.8H, v27.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v27.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + sub v26.8H, v23.8H, v14.8H // ................................................*....................................... + // gap // ........................................................................................ + add v23.8H, v23.8H, v14.8H // .................................................*...................................... + // gap // ........................................................................................ + mls v24.8H, v22.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ 
+ mul v22.8H, v19.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ + sqrdmulh v19.8H, v19.8H, v1.H[3] // .....................*.................................................................. + // gap // ........................................................................................ + sub v20.8H, v20.8H, v11.8H // .......................*................................................................ + // gap // ........................................................................................ + sub v11.8H, v28.8H, v24.8H // .................................*...................................................... + // gap // ........................................................................................ + add v28.8H, v28.8H, v24.8H // ..................................*..................................................... + // gap // ........................................................................................ + mls v22.8H, v19.8H, v7.H[0] // ......................*................................................................. + // gap // ........................................................................................ + mul v19.8H, v20.8H, v1.H[4] // .........................*.............................................................. + // gap // ........................................................................................ + mls v3.8H, v27.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v20.8H, v1.H[5] // ..........................*............................................................. 
+ // gap // ........................................................................................ + mul v24.8H, v11.8H, v0.H[2] // ...................................*.................................................... + // gap // ........................................................................................ + sqrdmulh v20.8H, v11.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + mul v4.8H, v26.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ + sqrdmulh v11.8H, v26.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + mul v26.8H, v23.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + sqrdmulh v6.8H, v23.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + mls v19.8H, v27.8H, v7.H[0] // ...........................*............................................................ + // gap // ........................................................................................ + mls v24.8H, v20.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ 
+ sub v27.8H, v25.8H, v5.8H // ......................................*................................................. + // gap // ........................................................................................ + mls v4.8H, v11.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + sub v23.8H, v22.8H, v19.8H // ...........................................*............................................ + // gap // ........................................................................................ + mul v20.8H, v27.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ + sqrdmulh v11.8H, v27.8H, v0.H[5] // .........................................*.............................................. + // gap // ........................................................................................ + add v27.8H, v22.8H, v19.8H // ............................................*........................................... + // gap // ........................................................................................ + mul v22.8H, v23.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + sqrdmulh v23.8H, v23.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + sub v19.8H, v28.8H, v27.8H // .....................................................*.................................. 
+ // gap // ........................................................................................ + add v10.8H, v28.8H, v27.8H // ......................................................*................................. + // gap // ........................................................................................ + mls v20.8H, v11.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ + mls v22.8H, v23.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + mul v28.8H, v19.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v19.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + sub v14.8H, v3.8H, v20.8H // ..........................................................*............................. + // gap // ........................................................................................ + add v27.8H, v3.8H, v20.8H // ...........................................................*............................ + // gap // ........................................................................................ + ldr q3, [x0, #272] // ....e................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sub v20.8H, v24.8H, v22.8H // ...............................................................*........................ + // gap // ........................................................................................ + add v21.8H, v24.8H, v22.8H // ................................................................*....................... + // gap // ........................................................................................ + mls v26.8H, v6.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + mul v22.8H, v20.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + sqrdmulh v19.8H, v20.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + mls v28.8H, v23.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + ldr q11, [x0, #464] // .......e................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v22.8H, v19.8H, v7.H[0] // ...................................................................*.................... 
+ // gap // ........................................................................................ + str q28, [x0, #320] // .....................................................................*.................. + // gap // ........................................................................................ + mul v23.8H, v10.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + sqrdmulh v19.8H, v10.8H, v30.8H // ............................................................................*........... + // gap // ........................................................................................ + str q22, [x0, #448] // .......................................................................*................ + // gap // ........................................................................................ + mul v28.8H, v27.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + sqrdmulh v20.8H, v27.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + str q26, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + mls v23.8H, v19.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + mul v19.8H, v21.8H, v29.8H // .................................................................................*...... 
+ // gap // ........................................................................................ + sqrdmulh v22.8H, v21.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + mls v28.8H, v20.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + ldr q20, [x0, #384] // ......e................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q26, [x0, #320] // .....e.................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q23, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + ldr q23, [x0, #64] // .e...................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mls v19.8H, v22.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + str q28, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + ldr q28, [x0, #192] // ...e.................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q19, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + ldr q22, [x0, #128] // ..e..................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v8.8H, v14.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v14.8H, v0.H[1] // .............................................................*.......................... 
+ // gap // ........................................................................................ + add v24.8H, v22.8H, v28.8H // ..............e......................................................................... + // gap // ........................................................................................ + str q4, [x0, #240] // ....................................................................*................... + // gap // ........................................................................................ + add v25.8H, v3.8H, v26.8H // ...................e.................................................................... + // gap // ........................................................................................ + mls v8.8H, v27.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + add v5.8H, v20.8H, v11.8H // ........................e............................................................... + // gap // ........................................................................................ + ldr q27, [x0, #0] // e....................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q8, [x0, #368] // ......................................................................*................. + // gap // ........................................................................................ + add v14.8H, v25.8H, v5.8H // .......................................e................................................ 
+ // gap // ........................................................................................ + + // original source code + // ldr q8, [x0, #0] // ....................................e..|....................................................................................e. + // ldr q9, [x0, #(1*(512/8))] // .......................e...............|.......................................................................e.............. + // ldr q10, [x0, #(2*(512/8))] // ............................e..........|............................................................................e......... + // ldr q11, [x0, #(3*(512/8))] // ..........................e............|..........................................................................e........... + // ldr q12, [x0, #(4*(512/8))] // e......................................|................................................e..................................... + // ldr q13, [x0, #(5*(512/8))] // .....................e.................|.....................................................................e................ + // ldr q14, [x0, #(6*(512/8))] // ....................e..................|....................................................................e................. + // ldr q15, [x0, #(7*(512/8))] // .......e...............................|.......................................................e.............................. + // sub v24.8h, v8.8h, v9.8h // .......................................*...................................................................................... + // add v8.8h, v8.8h, v9.8h // .......................................|*..................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // .......................................|..*................................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[7] // .......................................|...*.................................................................................. + // mls v9.8h, v24.8h, v7.h[0] // .......................................|........*............................................................................. + // sub v24.8h, v10.8h, v11.8h // .......................................|.*.................................................................................... + // add v10.8h, v10.8h, v11.8h // ...............................e.......|...............................................................................e...... + // mul v11.8h, v24.8h, v1.h[0] // .......................................|......*............................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // .......................................|.......*.............................................................................. + // mls v11.8h, v24.8h, v7.h[0] // .......................................|..............*....................................................................... + // sub v24.8h, v12.8h, v13.8h // .......................................|.........*............................................................................ + // add v12.8h, v12.8h, v13.8h // .................................e.....|.................................................................................e.... + // mul v13.8h, v24.8h, v1.h[2] // .......................................|...............*...................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // .......................................|................*..................................................................... + // mls v13.8h, v24.8h, v7.h[0] // .......................................|....................*................................................................. 
+ // sub v24.8h, v14.8h, v15.8h // .......................................|.................*.................................................................... + // add v14.8h, v14.8h, v15.8h // ...................................e...|...................................................................................e.. + // mul v15.8h, v24.8h, v1.h[4] // .......................................|.....................*................................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .......................................|.......................*.............................................................. + // mls v15.8h, v24.8h, v7.h[0] // .......................................|..............................*....................................................... + // sub v24.8h, v8.8h, v10.8h // .......................................|....*................................................................................. + // add v8.8h, v8.8h, v10.8h // .......................................|.....*................................................................................ + // mul v10.8h, v24.8h, v0.h[2] // .......................................|..........*........................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................|...........*.......................................................................... + // mls v10.8h, v24.8h, v7.h[0] // .......................................|......................*............................................................... + // sub v24.8h, v9.8h, v11.8h // .......................................|..................*................................................................... + // add v9.8h, v9.8h, v11.8h // .......................................|...................*.................................................................. 
+ // mul v11.8h, v24.8h, v0.h[2] // .......................................|........................*............................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................|.........................*............................................................ + // mls v11.8h, v24.8h, v7.h[0] // .......................................|...............................*...................................................... + // sub v24.8h, v12.8h, v14.8h // .......................................|................................*..................................................... + // add v12.8h, v12.8h, v14.8h // ......................................e|...................................................................................... + // mul v14.8h, v24.8h, v0.h[4] // .......................................|...................................*.................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................|....................................*................................................. + // mls v14.8h, v24.8h, v7.h[0] // .......................................|..........................................*........................................... + // sub v24.8h, v13.8h, v15.8h // .......................................|..................................*................................................... + // add v13.8h, v13.8h, v15.8h // .......................................|.....................................*................................................ + // mul v15.8h, v24.8h, v0.h[4] // .......................................|......................................*............................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......................................|.......................................*.............................................. 
+ // mls v15.8h, v24.8h, v7.h[0] // .......................................|...........................................*.......................................... + // sub v24.8h, v8.8h, v12.8h // .......................................|............*......................................................................... + // add v8.8h, v8.8h, v12.8h // .......................................|.............*........................................................................ + // mul v12.8h, v24.8h, v0.h[0] // .......................................|..........................*........................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................|...........................*.......................................................... + // mls v12.8h, v24.8h, v7.h[0] // .......................................|.................................*.................................................... + // sub v24.8h, v9.8h, v13.8h // .......................................|........................................*............................................. + // add v9.8h, v9.8h, v13.8h // .......................................|.........................................*............................................ + // mul v13.8h, v24.8h, v0.h[0] // .......................................|............................................*......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................|.............................................*........................................ + // mls v13.8h, v24.8h, v7.h[0] // ......*................................|......................................................*............................... + // sub v24.8h, v10.8h, v14.8h // .......................................|..............................................*....................................... 
+ // add v10.8h, v10.8h, v14.8h // .......................................|...............................................*...................................... + // mul v14.8h, v24.8h, v0.h[0] // .............................*.........|.............................................................................*........ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................*........|..............................................................................*....... + // mls v14.8h, v24.8h, v7.h[0] // ..................................*....|..................................................................................*... + // sub v24.8h, v11.8h, v15.8h // .*.....................................|.................................................*.................................... + // add v11.8h, v11.8h, v15.8h // ..*....................................|..................................................*................................... + // mul v15.8h, v24.8h, v0.h[0] // ....*..................................|....................................................*................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....*.................................|.....................................................*................................ + // mls v15.8h, v24.8h, v7.h[0] // ........*..............................|........................................................*............................. + // str q12, [x0, #(4*(512/8))] // ................................*......|................................................................................*..... + // str q13, [x0, #(5*(512/8))] // .........*.............................|.........................................................*............................ 
+ // str q14, [x0, #(6*(512/8))] // .....................................*.|.....................................................................................* + // str q15, [x0, #(7*(512/8))] // ............*..........................|............................................................*......................... + // mul v12.8h, v8.8h, v29.8h // .......................................|............................*......................................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // .......................................|.............................*........................................................ + // mls v12.8h, v8.8h, v7.h[0] // ...*...................................|...................................................*.................................. + // mul v13.8h, v9.8h, v29.8h // ..........*............................|..........................................................*........................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ...........*...........................|...........................................................*.......................... + // mls v13.8h, v9.8h, v7.h[0] // ................*......................|................................................................*..................... + // mul v14.8h, v10.8h, v29.8h // .............*.........................|.............................................................*........................ + // sqrdmulh v10.8h, v10.8h, v30.8h // ..............*........................|..............................................................*....................... + // mls v14.8h, v10.8h, v7.h[0] // ...................*...................|...................................................................*.................. + // mul v15.8h, v11.8h, v29.8h // .................*.....................|.................................................................*.................... 
+ // sqrdmulh v11.8h, v11.8h, v30.8h // ..................*....................|..................................................................*................... + // mls v15.8h, v11.8h, v7.h[0] // ........................*..............|........................................................................*............. + // str q12, [x0], #(16) // ...............*.......................|...............................................................*...................... + // str q13, [x0, #(-16 + 1*(512/8))] // ......................*................|......................................................................*............... + // str q14, [x0, #(-16 + 2*(512/8))] // .........................*.............|.........................................................................*............ + // str q15, [x0, #(-16 + 3*(512/8))] // ...........................*...........|...........................................................................*.......... + + sub count, count, #1 + cbnz count, layer123_start + sub v10.8H, v22.8H, v28.8H // ..*......................................................................... + // gap // ............................................................................ + sub v18.8H, v27.8H, v23.8H // *........................................................................... + // gap // ............................................................................ + sub v19.8H, v20.8H, v11.8H // ..................*......................................................... + // gap // ............................................................................ + sqrdmulh v22.8H, v10.8H, v1.H[1] // ........*................................................................... + // gap // ............................................................................ + sqrdmulh v15.8H, v18.8H, v0.H[7] // ....*....................................................................... 
+ // gap // ............................................................................ + mul v28.8H, v18.8H, v0.H[6] // ...*........................................................................ + // gap // ............................................................................ + mul v8.8H, v10.8H, v1.H[0] // .......*.................................................................... + // gap // ............................................................................ + sub v25.8H, v25.8H, v5.8H // .................................*.......................................... + // gap // ............................................................................ + mul v10.8H, v19.8H, v1.H[4] // ......................*..................................................... + // gap // ............................................................................ + mls v28.8H, v15.8H, v7.H[0] // .........*.................................................................. + // gap // ............................................................................ + mls v8.8H, v22.8H, v7.H[0] // ...............*............................................................ + // gap // ............................................................................ + sqrdmulh v22.8H, v19.8H, v1.H[5] // ........................*................................................... + // gap // ............................................................................ + sqrdmulh v19.8H, v25.8H, v0.H[5] // .....................................*...................................... + // gap // ............................................................................ + add v11.8H, v27.8H, v23.8H // .*.......................................................................... + // gap // ............................................................................ + sub v20.8H, v28.8H, v8.8H // ...................*........................................................ 
+ // gap // ............................................................................ + mls v10.8H, v22.8H, v7.H[0] // ...............................*............................................ + // gap // ............................................................................ + add v17.8H, v11.8H, v24.8H // ......*..................................................................... + // gap // ............................................................................ + sqrdmulh v27.8H, v20.8H, v0.H[3] // ..........................*................................................. + // gap // ............................................................................ + mul v5.8H, v20.8H, v0.H[2] // .........................*.................................................. + // gap // ............................................................................ + sub v18.8H, v11.8H, v24.8H // .....*...................................................................... + // gap // ............................................................................ + add v23.8H, v17.8H, v14.8H // ..............*............................................................. + // gap // ............................................................................ + mul v16.8H, v25.8H, v0.H[4] // ....................................*....................................... + // gap // ............................................................................ + mls v5.8H, v27.8H, v7.H[0] // ................................*........................................... + // gap // ............................................................................ + sqrdmulh v20.8H, v23.8H, v30.8H // ..............................*............................................. + // gap // ............................................................................ + sqrdmulh v27.8H, v18.8H, v0.H[3] // ............*............................................................... 
+ // gap // ............................................................................ + mul v22.8H, v18.8H, v0.H[2] // ...........*................................................................ + // gap // ............................................................................ + mul v25.8H, v23.8H, v29.8H // .............................*.............................................. + // gap // ............................................................................ + sub v26.8H, v3.8H, v26.8H // ..........*................................................................. + // gap // ............................................................................ + mls v16.8H, v19.8H, v7.H[0] // ...........................................*................................ + // gap // ............................................................................ + mls v22.8H, v27.8H, v7.H[0] // .......................*.................................................... + // gap // ............................................................................ + sqrdmulh v19.8H, v26.8H, v1.H[3] // .................*.......................................................... + // gap // ............................................................................ + mul v11.8H, v26.8H, v1.H[2] // ................*........................................................... + // gap // ............................................................................ + add v26.8H, v28.8H, v8.8H // ....................*....................................................... + // gap // ............................................................................ + sub v3.8H, v22.8H, v16.8H // ...............................................*............................ + // gap // ............................................................................ + add v4.8H, v22.8H, v16.8H // ................................................*........................... 
+ // gap // ............................................................................ + mls v11.8H, v19.8H, v7.H[0] // .....................*...................................................... + // gap // ............................................................................ + mul v19.8H, v3.8H, v0.H[0] // .......................................................................*.... + // gap // ............................................................................ + sqrdmulh v9.8H, v3.8H, v0.H[1] // ........................................................................*... + // gap // ............................................................................ + sqrdmulh v23.8H, v4.8H, v30.8H // .............................................................*.............. + // gap // ............................................................................ + sub v31.8H, v11.8H, v10.8H // ...................................*........................................ + // gap // ............................................................................ + add v3.8H, v11.8H, v10.8H // ......................................*..................................... + // gap // ............................................................................ + mul v8.8H, v4.8H, v29.8H // ............................................................*............... + // gap // ............................................................................ + mul v24.8H, v31.8H, v0.H[4] // .......................................*.................................... + // gap // ............................................................................ + sub v28.8H, v26.8H, v3.8H // .........................................*.................................. + // gap // ............................................................................ + sqrdmulh v27.8H, v31.8H, v0.H[5] // ........................................*................................... 
+ // gap // ............................................................................ + mls v8.8H, v23.8H, v7.H[0] // ..................................................................*......... + // gap // ............................................................................ + mul v22.8H, v28.8H, v0.H[0] // .............................................*.............................. + // gap // ............................................................................ + sqrdmulh v28.8H, v28.8H, v0.H[1] // ..............................................*............................. + // gap // ............................................................................ + mls v24.8H, v27.8H, v7.H[0] // ............................................*............................... + // gap // ............................................................................ + mls v25.8H, v20.8H, v7.H[0] // ...................................................*........................ + // gap // ............................................................................ + add v11.8H, v26.8H, v3.8H // ..........................................*................................. + // gap // ............................................................................ + mls v22.8H, v28.8H, v7.H[0] // ......................................................*..................... + // gap // ............................................................................ + sub v23.8H, v5.8H, v24.8H // .................................................*.......................... + // gap // ............................................................................ + mul v3.8H, v11.8H, v29.8H // .........................................................*.................. + // gap // ............................................................................ + add v28.8H, v5.8H, v24.8H // ..................................................*......................... 
+ // gap // ............................................................................ + sqrdmulh v24.8H, v23.8H, v0.H[1] // .....................................................*...................... + // gap // ............................................................................ + mul v20.8H, v23.8H, v0.H[0] // ....................................................*....................... + // gap // ............................................................................ + mul v27.8H, v28.8H, v29.8H // ................................................................*........... + // gap // ............................................................................ + sqrdmulh v23.8H, v28.8H, v30.8H // .................................................................*.......... + // gap // ............................................................................ + str q22, [x0, #320] // ........................................................*................... + // gap // ............................................................................ + mls v20.8H, v24.8H, v7.H[0] // .......................................................*.................... + // gap // ............................................................................ + sub v22.8H, v17.8H, v14.8H // .............*.............................................................. + // gap // ............................................................................ + mls v27.8H, v23.8H, v7.H[0] // ....................................................................*....... + // gap // ............................................................................ + str q8, [x0, #128] // .....................................................................*...... + // gap // ............................................................................ + sqrdmulh v26.8H, v11.8H, v30.8H // ..........................................................*................. 
+ // gap // ............................................................................ + str q20, [x0, #448] // ...........................................................*................ + // gap // ............................................................................ + sqrdmulh v28.8H, v22.8H, v0.H[1] // ............................*............................................... + // gap // ............................................................................ + mul v24.8H, v22.8H, v0.H[0] // ...........................*................................................ + // gap // ............................................................................ + str q27, [x0, #192] // ......................................................................*..... + // gap // ............................................................................ + mls v3.8H, v26.8H, v7.H[0] // ...............................................................*............ + // gap // ............................................................................ + str q25, [x0], #(16) // ..............................................................*............. + // gap // ............................................................................ + mls v24.8H, v28.8H, v7.H[0] // ..................................*......................................... + // gap // ............................................................................ + mls v19.8H, v9.8H, v7.H[0] // ..........................................................................*. + // gap // ............................................................................ + str q3, [x0, #48] // ...................................................................*........ + // gap // ............................................................................ + // gap // ............................................................................ 
+ // gap // ............................................................................ + str q24, [x0, #240] // .........................................................................*.. + // gap // ............................................................................ + // gap // ............................................................................ + // gap // ............................................................................ + str q19, [x0, #368] // ...........................................................................* + // gap // ............................................................................ + + // original source code + // sub v19.8H, v27.8H, v23.8H // .*.......................................................................... + // add v23.8H, v27.8H, v23.8H // .............*.............................................................. + // sub v22.8H, v22.8H, v28.8H // *........................................................................... + // mul v28.8H, v19.8H, v0.H[6] // .....*...................................................................... + // sqrdmulh v19.8H, v19.8H, v0.H[7] // ....*....................................................................... + // sub v27.8H, v23.8H, v24.8H // ...................*........................................................ + // add v23.8H, v23.8H, v24.8H // ................*........................................................... + // mul v24.8H, v22.8H, v1.H[0] // ......*..................................................................... + // sqrdmulh v22.8H, v22.8H, v1.H[1] // ...*........................................................................ + // mls v28.8H, v19.8H, v7.H[0] // .........*.................................................................. + // sub v19.8H, v3.8H, v26.8H // ...........................*................................................ 
+ // mul v3.8H, v27.8H, v0.H[2] // .........................*.................................................. + // sqrdmulh v27.8H, v27.8H, v0.H[3] // ........................*................................................... + // sub v26.8H, v23.8H, v14.8H // .............................................................*.............. + // add v23.8H, v23.8H, v14.8H // ....................*....................................................... + // mls v24.8H, v22.8H, v7.H[0] // ..........*................................................................. + // mul v22.8H, v19.8H, v1.H[2] // ...............................*............................................ + // sqrdmulh v19.8H, v19.8H, v1.H[3] // ..............................*............................................. + // sub v20.8H, v20.8H, v11.8H // ..*......................................................................... + // sub v11.8H, v28.8H, v24.8H // ..............*............................................................. + // add v28.8H, v28.8H, v24.8H // ................................*........................................... + // mls v22.8H, v19.8H, v7.H[0] // ...................................*........................................ + // mul v19.8H, v20.8H, v1.H[4] // ........*................................................................... + // mls v3.8H, v27.8H, v7.H[0] // .............................*.............................................. + // sqrdmulh v27.8H, v20.8H, v1.H[5] // ...........*................................................................ + // mul v24.8H, v11.8H, v0.H[2] // ..................*......................................................... + // sqrdmulh v20.8H, v11.8H, v0.H[3] // .................*.......................................................... + // mul v4.8H, v26.8H, v0.H[0] // ...................................................................*........ 
+ // sqrdmulh v11.8H, v26.8H, v0.H[1] // ..................................................................*......... + // mul v26.8H, v23.8H, v29.8H // ..........................*................................................. + // sqrdmulh v6.8H, v23.8H, v30.8H // .......................*.................................................... + // mls v19.8H, v27.8H, v7.H[0] // ...............*............................................................ + // mls v24.8H, v20.8H, v7.H[0] // ......................*..................................................... + // sub v27.8H, v25.8H, v5.8H // .......*.................................................................... + // mls v4.8H, v11.8H, v7.H[0] // .......................................................................*.... + // sub v23.8H, v22.8H, v19.8H // .......................................*.................................... + // mul v20.8H, v27.8H, v0.H[4] // .....................*...................................................... + // sqrdmulh v11.8H, v27.8H, v0.H[5] // ............*............................................................... + // add v27.8H, v22.8H, v19.8H // ........................................*................................... + // mul v22.8H, v23.8H, v0.H[4] // ..........................................*................................. + // sqrdmulh v23.8H, v23.8H, v0.H[5] // ............................................*............................... + // sub v19.8H, v28.8H, v27.8H // ...........................................*................................ + // add v10.8H, v28.8H, v27.8H // ..................................................*......................... + // mls v20.8H, v11.8H, v7.H[0] // ............................*............................................... + // mls v22.8H, v23.8H, v7.H[0] // ................................................*........................... 
+ // mul v28.8H, v19.8H, v0.H[0] // ..............................................*............................. + // sqrdmulh v23.8H, v19.8H, v0.H[1] // ...............................................*............................ + // sub v14.8H, v3.8H, v20.8H // .................................*.......................................... + // add v27.8H, v3.8H, v20.8H // ..................................*......................................... + // sub v20.8H, v24.8H, v22.8H // ....................................................*....................... + // add v21.8H, v24.8H, v22.8H // ......................................................*..................... + // mls v26.8H, v6.8H, v7.H[0] // .................................................*.......................... + // mul v22.8H, v20.8H, v0.H[0] // ........................................................*................... + // sqrdmulh v19.8H, v20.8H, v0.H[1] // .......................................................*.................... + // mls v28.8H, v23.8H, v7.H[0] // ...................................................*........................ + // mls v22.8H, v19.8H, v7.H[0] // ............................................................*............... + // str q28, [x0, #320] // ...........................................................*................ + // mul v23.8H, v10.8H, v29.8H // .....................................................*...................... + // sqrdmulh v19.8H, v10.8H, v30.8H // ................................................................*........... + // str q22, [x0, #448] // .................................................................*.......... + // mul v28.8H, v27.8H, v29.8H // .........................................*.................................. + // sqrdmulh v20.8H, v27.8H, v30.8H // ......................................*..................................... 
+ // str q26, [x0], #(16) // ......................................................................*..... + // mls v23.8H, v19.8H, v7.H[0] // .....................................................................*...... + // mul v19.8H, v21.8H, v29.8H // .........................................................*.................. + // sqrdmulh v22.8H, v21.8H, v30.8H // ..........................................................*................. + // mls v28.8H, v20.8H, v7.H[0] // .............................................*.............................. + // str q23, [x0, #48] // .........................................................................*.. + // mls v19.8H, v22.8H, v7.H[0] // ..............................................................*............. + // str q28, [x0, #112] // ...............................................................*............ + // str q19, [x0, #176] // ....................................................................*....... + // mul v8.8H, v14.8H, v0.H[0] // ....................................*....................................... + // sqrdmulh v27.8H, v14.8H, v0.H[1] // .....................................*...................................... + // str q4, [x0, #240] // ..........................................................................*. + // mls v8.8H, v27.8H, v7.H[0] // ........................................................................*... 
+ // str q8, [x0, #368] // ...........................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s new file mode 100644 index 00000000..669bcee5 --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_a72.s @@ -0,0 +1,1823 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// ldr_vo: load the Q-register alias qform_<vec> from [base + offset].
+// The base register is not modified. A `qform_<vec> .req q<N>` alias must
+// exist for every <vec> that is passed in.
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+// ldr_vi: load qform_<vec> from [base], then advance base by inc (post-index).
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+// str_vo: store qform_<vec> to [base + offset]. The base register is not
+// modified.
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+// str_vi: store qform_<vec> to [base], then advance base by inc (post-index).
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// Thin wrappers over arithmetic on 8 x 16-bit lanes. The wrappers whose name
+// ends in "q" take the second multiplicand from a single lane, b.h[i]:
+//   vqrdmulh  d,a,b   : d = sqrdmulh(a, b)        (vector by vector)
+//   vmlsq     d,a,b,i : d = d - a * b.h[i]
+//   vqrdmulhq d,a,b,i : d = sqrdmulh(a, b.h[i])
+//   vqdmulhq  d,a,b,i : d = sqdmulh(a, b.h[i])
+//   vmulq     d,a,b,i : d = a * b.h[i]            (low 16 bits of each product)
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmlsq d,a,b,i
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+// mulmodq: per 16-bit lane,
+//   dst = src * const.h[idx0] - sqrdmulh(src, const.h[idx1]) * consts.h[0]
+// src is overwritten with the sqrdmulh intermediate. The register alias
+// `consts` is read implicitly and must be defined by the including code.
+// NOTE(review): presumably const.h[idx1] is the precomputed "twisted"
+// companion of const.h[idx0] and consts.h[0] is the modulus -- confirm
+// against the twiddle and constant tables.
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// mulmod: same computation as mulmodq, but the multiplier and its twisted
+// companion are full vectors (const, const_twisted) rather than single lanes.
+// src is overwritten; `consts` is read implicitly.
+.macro mulmod dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// gs_butterfly: tmp = a - b; a = a + b; b = mulmodq(tmp, root.h[idx0], root.h[idx1]).
+// Clobbers the register alias `tmp`, which must be defined by the including code.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// mulmod_v: body is identical to mulmod above.
+.macro mulmod_v dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// gs_butterfly_v: like gs_butterfly, but with full-vector root and twisted
+// root: tmp = a - b; a = a + b; b = mulmod(tmp, root, root_twisted).
+// Clobbers `tmp`.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// mul_ninv: dstK = mulmod(srcK, ninv, ninv_tw) for K = 0..3. Every srcK is
+// overwritten (see mulmod). The aliases `ninv` and `ninv_tw` must be defined
+// by the including code.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// barrett_reduce: t0 = sqdmulh(a, consts.h[1]); t0 = srshr(t0, #11);
+// a = a - t0 * consts.h[0]. Clobbers the register alias `t0`.
+.macro barrett_reduce a
+        vqdmulhq t0, \a, consts, 1
+        srshr t0.8h, t0.8h, #11
+        vmlsq \a, t0, consts, 0
+.endm
+
+// load_roots_123: load root0 from [r_ptr0] and root1 from the 16 bytes that
+// follow it; r_ptr0 ends up advanced by 32 bytes.
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 32
+        ldr_vo root1, r_ptr0, -16
+.endm
+
+// load_next_roots_45: load root0 from [r_ptr0] and advance r_ptr0 by 16 bytes.
+.macro load_next_roots_45
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+// load_next_roots_67: load six consecutive 16-byte vectors starting at
+// [r_ptr1] into root0, root0_tw, root1, root1_tw, root2, root2_tw (in that
+// order); r_ptr1 ends up advanced by 6*16 bytes. The negative offsets
+// compensate for the post-increment performed by the first load.
+.macro load_next_roots_67
+        ldr_vi root0, r_ptr1, (6*16)
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+// transpose4: in-place 4x4 transpose of the 32-bit elements held in
+// <data>0..<data>3. Stage 1 interleaves 32-bit lanes into t0..t3, stage 2
+// interleaves 64-bit halves back into <data>0..<data>3. Clobbers t0..t3.
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+// transpose_single: only the 32-bit interleave stage of transpose4, reading
+// <data_in>0..3 and writing <data_out>0..3.
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+// save_gprs: reserve 6*16 bytes of stack and spill x19..x29.
+// Layout relative to the new sp: +0 x19/x20, +16 x21/x22, +32 x23/x24,
+// +48 x25/x26, +64 x27/x28, +80 x29 (bytes +88..+95 are unused).
+// Each register pair is stored exactly once.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+// restore_gprs: reload x19..x29 from the layout written by save_gprs and
+// release the 6*16 bytes of stack.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// save_vregs: reserve 4*16 bytes of stack and spill d8..d15 (the low 64 bits
+// of v8..v15) as four pairs.
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// restore_vregs: reload d8..d15 from the layout written by save_vregs and
+// release the 4*16 bytes of stack.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// STACK_SIZE: bytes of scratch stack reserved by push_stack on top of the
+// saved registers. STACK0: offset of the first slot in that scratch area,
+// used with the save/restore macros.
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc //
slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, [sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_manual_ld4_opt_a72 + .global _intt_kyber_123_4567_manual_ld4_opt_a72 + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_manual_ld4_opt_a72: +_intt_kyber_123_4567_manual_ld4_opt_a72: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + 
xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q2, [x4, #64] // ..........*......................................... + ld4 {v17.4S, v18.4S, v19.4S, v20.4S}, [x1] // *................................................... + ldr q31, [x4, #32] // ............*....................................... + ldr q21, [x4, #16] // ....*............................................... + ldr q23, [x4, #48] // .*.................................................. + // gap // .................................................... + ldr q10, [x4, #80] // ..*................................................. + ldr q27, [x4], #(6*16) // ...*................................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v12.8H, v17.8H, v18.8H // .......*............................................ + add v9.8H, v19.8H, v20.8H // ......*............................................. + // gap // .................................................... 
+ sub v3.8H, v19.8H, v20.8H // .....*.............................................. + add v20.8H, v17.8H, v18.8H // ........*........................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v29.8H, v12.8H, v31.8H // ................*................................... + // gap // .................................................... + // gap // .................................................... + add v4.8H, v20.8H, v9.8H // ..............*..................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v15.8H, v12.8H, v23.8H // ...........*........................................ + sub v1.8H, v20.8H, v9.8H // .............*...................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v13.8H, v3.8H, v10.8H // .........*.......................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v28.8H, v3.8H, v2.8H // ...............*.................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + mls v29.8H, v15.8H, v7.H[0] // .................*.................................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v28.8H, v13.8H, v7.H[0] // ..................*................................. + ldr q13, [x3], #16 // .........................................*.......... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v10.8H, v1.8H, v27.8H // ......................*............................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v17.8H, v29.8H, v28.8H // ....................*............................... + add v23.8H, v29.8H, v28.8H // .....................*.............................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ sqrdmulh v29.8H, v1.8H, v21.8H // ...................*................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + trn1 v22.4S, v4.4S, v23.4S // ........................*........................... + sqrdmulh v18.8H, v17.8H, v21.8H // .......................*............................ + // gap // .................................................... + trn2 v1.4S, v4.4S, v23.4S // ..........................*......................... + // gap // .................................................... + // gap // .................................................... + mul v16.8H, v17.8H, v27.8H // .........................*.......................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v10.8H, v29.8H, v7.H[0] // ...........................*........................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v16.8H, v18.8H, v7.H[0] // ............................*....................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + trn2 v21.4S, v10.4S, v16.4S // .............................*...................... + trn1 v10.4S, v10.4S, v16.4S // ..............................*..................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + trn2 v14.2D, v1.2D, v21.2D // ..................................*................. + trn2 v6.2D, v22.2D, v10.2D // .................................*.................. + // gap // .................................................... + trn1 v17.2D, v22.2D, v10.2D // ................................*................... + trn1 v24.2D, v1.2D, v21.2D // ...............................*.................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + add v18.8H, v6.8H, v14.8H // .....................................*.............. 
+ // gap // .................................................... + // gap // .................................................... + add v23.8H, v17.8H, v24.8H // ...................................*................ + sub v30.8H, v17.8H, v24.8H // ....................................*............... + // gap // .................................................... + sub v8.8H, v6.8H, v14.8H // .............................................*...... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqdmulh v4.8H, v18.8H, v7.H[1] // .......................................*............ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqdmulh v19.8H, v23.8H, v7.H[1] // ......................................*............. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + srshr v11.8H, v4.8H, #11 // ..........................................*......... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + srshr v17.8H, v19.8H, #11 // ........................................*........... + // gap // .................................................... + // gap // .................................................... + mls v18.8H, v11.8H, v7.H[0] // ............................................*....... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v23.8H, v17.8H, v7.H[0] // ...........................................*........ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v11.8H, v8.8H, v13.H[5] // .................................................*.. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v26.8H, v23.8H, v18.8H // ................................................*... + add v16.8H, v23.8H, v18.8H // ...............................................*.... + // gap // .................................................... 
+ // gap // .................................................... + // gap // .................................................... + sqrdmulh v23.8H, v30.8H, v13.H[3] // ...................................................* + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + str q16, [x1], #(64) // ..................................................*. + mul v12.8H, v8.8H, v13.H[4] // ..............................................*..... + // gap // .................................................... + + // original source code + // ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // .*.................................................. + // ldr q27, [x4, #48] // ....*............................................... + // ldr q24, [x4, #80] // .....*.............................................. + // ldr q11, [x4], #(6*16) // ......*............................................. + // ldr q8, [x4, #-80] // ...*................................................ + // sub v28.8H, v5.8H, v6.8H // .........*.......................................... + // add v29.8H, v5.8H, v6.8H // ........*........................................... + // sub v6.8H, v3.8H, v4.8H // .......*............................................ + // add v0.8H, v3.8H, v4.8H // ..........*......................................... + // sqrdmulh v3.8H, v28.8H, v24.8H // ...............*.................................... + // ldr q18, [x4, #-32] // *................................................... + // sqrdmulh v25.8H, v6.8H, v27.8H // .............*...................................... + // ldr q27, [x4, #-64] // ..*................................................. + // sub v19.8H, v0.8H, v29.8H // ..............*..................................... + // add v15.8H, v0.8H, v29.8H // ............*....................................... 
+ // mul v29.8H, v28.8H, v18.8H // ................*................................... + // mul v27.8H, v6.8H, v27.8H // ...........*........................................ + // mls v27.8H, v25.8H, v7.H[0] // .................*.................................. + // mls v29.8H, v3.8H, v7.H[0] // ..................*................................. + // sqrdmulh v18.8H, v19.8H, v8.8H // .......................*............................ + // sub v10.8H, v27.8H, v29.8H // .....................*.............................. + // add v24.8H, v27.8H, v29.8H // ......................*............................. + // mul v0.8H, v19.8H, v11.8H // ....................*............................... + // sqrdmulh v4.8H, v10.8H, v8.8H // .........................*.......................... + // trn1 v9.4S, v15.4S, v24.4S // ........................*........................... + // mul v25.8H, v10.8H, v11.8H // ...........................*........................ + // trn2 v24.4S, v15.4S, v24.4S // ..........................*......................... + // mls v0.8H, v18.8H, v7.H[0] // ............................*....................... + // mls v25.8H, v4.8H, v7.H[0] // .............................*...................... + // trn2 v11.4S, v0.4S, v25.4S // ..............................*..................... + // trn1 v27.4S, v0.4S, v25.4S // ...............................*.................... + // trn1 v20.2D, v24.2D, v11.2D // ...................................*................ + // trn1 v30.2D, v9.2D, v27.2D // ..................................*................. + // trn2 v15.2D, v9.2D, v27.2D // .................................*.................. + // trn2 v16.2D, v24.2D, v11.2D // ................................*................... + // add v29.8H, v30.8H, v20.8H // .....................................*.............. + // sub v30.8H, v30.8H, v20.8H // ......................................*............. 
+ // add v25.8H, v15.8H, v16.8H // ....................................*............... + // sqdmulh v24.8H, v29.8H, v7.H[1] // .........................................*.......... + // sqdmulh v26.8H, v25.8H, v7.H[1] // ........................................*........... + // srshr v24.8H, v24.8H, #11 // ...........................................*........ + // ldr q13, [x3], #16 // ...................*................................ + // srshr v23.8H, v26.8H, #11 // ..........................................*......... + // mls v29.8H, v24.8H, v7.H[0] // .............................................*...... + // mls v25.8H, v23.8H, v7.H[0] // ............................................*....... + // sub v14.8H, v15.8H, v16.8H // .......................................*............ + // mul v12.8H, v14.8H, v13.H[4] // ...................................................* + // add v28.8H, v29.8H, v25.8H // ................................................*... + // sub v26.8H, v29.8H, v25.8H // ...............................................*.... + // sqrdmulh v11.8H, v14.8H, v13.H[5] // ..............................................*..... + // str q28, [x1], #(64) // ..................................................*. + // sqrdmulh v23.8H, v30.8H, v13.H[3] // .................................................*.. + + sub count, count, #1 +layer4567_start: + ld4 {v3.4S, v4.4S, v5.4S, v6.4S}, [x1] // e....................................................................... + ldr q27, [x4, #48] // ....e................................................................... + // gap // ........................................................................ + ldr q24, [x4, #80] // ......e................................................................. + mul v22.8H, v30.8H, v13.H[2] // ......................................*................................. + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v12.8H, v11.8H, v7.H[0] // .............................................*.......................... + ldr q11, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + ldr q8, [x4, #-80] // ..e..................................................................... + sub v28.8H, v5.8H, v6.8H // ............e........................................................... + // gap // ........................................................................ + add v29.8H, v5.8H, v6.8H // .............e.......................................................... + mls v22.8H, v23.8H, v7.H[0] // ........................................*............................... + // gap // ........................................................................ + sub v6.8H, v3.8H, v4.8H // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v0.8H, v3.8H, v4.8H // ........e............................................................... + sqrdmulh v3.8H, v28.8H, v24.8H // ...............e........................................................ + ldr q18, [x4, #-32] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqrdmulh v25.8H, v6.8H, v27.8H // ..........e............................................................. + ldr q27, [x4, #-64] // ...e.................................................................... + // gap // ........................................................................ + sub v19.8H, v0.8H, v29.8H // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v15.8H, v0.8H, v29.8H // ..................e..................................................... + mul v29.8H, v28.8H, v18.8H // ..............e......................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v27.8H, v6.8H, v27.8H // .........e.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v27.8H, v25.8H, v7.H[0] // ...........e............................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v29.8H, v3.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v28.8H, v12.8H, v7.H[1] // .......................................................*................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v18.8H, v19.8H, v8.8H // ....................e................................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v10.8H, v27.8H, v29.8H // ......................e................................................. + // gap // ........................................................................ + // gap // ........................................................................ + add v24.8H, v27.8H, v29.8H // .......................e................................................ 
+ // gap // ........................................................................ + mul v0.8H, v19.8H, v11.8H // ...................e.................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v4.8H, v10.8H, v8.8H // .........................e.............................................. + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v9.4S, v15.4S, v24.4S // ...........................e............................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v25.8H, v10.8H, v11.8H // ........................e............................................... + trn2 v24.4S, v15.4S, v24.4S // ............................e........................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v18.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v25.8H, v4.8H, v7.H[0] // ..........................e............................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v3.8H, v22.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + srshr v27.8H, v28.8H, #11 // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + mul v14.8H, v26.8H, v13.H[0] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v11.4S, v0.4S, v25.4S // ..............................e......................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v12.8H, v27.8H, v7.H[0] // .........................................................*.............. + trn1 v27.4S, v0.4S, v25.4S // .............................e.......................................... 
+ // gap // ........................................................................ + srshr v3.8H, v3.8H, #11 // ..................................................*..................... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v2.8H, v26.8H, v13.H[1] // .............................................................*.......... + trn1 v20.2D, v24.2D, v11.2D // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v30.2D, v9.2D, v27.2D // .................................e...................................... + trn2 v15.2D, v9.2D, v27.2D // ...............................e........................................ + trn2 v16.2D, v24.2D, v11.2D // ................................e....................................... + // gap // ........................................................................ + mls v22.8H, v3.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v29.8H, v30.8H, v20.8H // .....................................e.................................. + mls v14.8H, v2.8H, v7.H[0] // ..............................................................*......... + sub v30.8H, v30.8H, v20.8H // ....................................e................................... 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v25.8H, v15.8H, v16.8H // ..........................................e............................. + sqdmulh v24.8H, v29.8H, v7.H[1] // ..............................................e......................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v28.8H, v22.8H, v12.8H // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + add v11.8H, v22.8H, v12.8H // ................................................................*....... + // gap // ........................................................................ + sqdmulh v26.8H, v25.8H, v7.H[1] // ....................................................e................... + str q14, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v0.8H, v28.8H, v13.H[1] // ..................................................................*..... + srshr v24.8H, v24.8H, #11 // ...............................................e........................ + str q11, [x1, #-48] // .....................................................................*.. 
+ // gap // ........................................................................ + mul v10.8H, v28.8H, v13.H[0] // .................................................................*...... + ldr q13, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + srshr v23.8H, v26.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v29.8H, v24.8H, v7.H[0] // ................................................e....................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v25.8H, v23.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + // gap // ........................................................................ + sub v14.8H, v15.8H, v16.8H // .........................................e.............................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v10.8H, v0.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mul v12.8H, v14.8H, v13.H[4] // ...........................................e............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v28.8H, v29.8H, v25.8H // ...........................................................e............ + sub v26.8H, v29.8H, v25.8H // ..........................................................e............. + sqrdmulh v11.8H, v14.8H, v13.H[5] // ............................................e........................... + // gap // ........................................................................ + str q10, [x1, #-16] // .......................................................................* + // gap // ........................................................................ + // gap // ........................................................................ + str q28, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + sqrdmulh v23.8H, v30.8H, v13.H[3] // .......................................e................................ + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e..................................................................... 
+ // ldr q0, [x4], #(6*16) // .....e..................................................................|....e................................................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // ......e.................................................................|.....e............................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // ...............e........................................................|..............e...................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .e......................................................................|e.................................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // .............e..........................................................|............e........................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ..e.....................................................................|.e................................................................... + // sub v24.8h, v8.8h, v9.8h // ..........e.............................................................|.........e........................................................... + // add v8.8h, v8.8h, v9.8h // ...........e............................................................|..........e.......................................................... + // mul v9.8h, v24.8h, v1.8h // ...................e....................................................|..................e.................................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // ..............e.........................................................|.............e....................................................... + // mls v9.8h, v24.8h, v7.h[0] // ....................e...................................................|...................e................................................. 
+ // sub v24.8h, v10.8h, v11.8h // .......e................................................................|......e.............................................................. + // add v10.8h, v10.8h, v11.8h // ........e...............................................................|.......e............................................................. + // mul v11.8h, v24.8h, v2.8h // ..................e.....................................................|.................e................................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ............e...........................................................|...........e......................................................... + // mls v11.8h, v24.8h, v7.h[0] // .....................e..................................................|....................e................................................ + // sub v24.8h, v8.8h, v10.8h // ................e.......................................................|...............e..................................................... + // add v8.8h, v8.8h, v10.8h // .................e......................................................|................e.................................................... + // mul v10.8h, v24.8h, v0.8h // ..........................e.............................................|.........................e........................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // .......................e................................................|......................e.............................................. + // mls v10.8h, v24.8h, v7.h[0] // ...............................e........................................|..............................e...................................... + // sub v24.8h, v9.8h, v11.8h // ........................e...............................................|.......................e............................................. 
+ // add v9.8h, v9.8h, v11.8h // .........................e..............................................|........................e............................................ + // mul v11.8h, v24.8h, v0.8h // .............................e..........................................|............................e........................................ + // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................e............................................|..........................e.......................................... + // mls v11.8h, v24.8h, v7.h[0] // ................................e.......................................|...............................e..................................... + // trn1 v25.4s, v8.4s, v9.4s // ............................e...........................................|...........................e......................................... + // trn2 v26.4s, v8.4s, v9.4s // ..............................e.........................................|.............................e....................................... + // trn1 v27.4s, v10.4s, v11.4s // ......................................e.................................|.....................................e............................... + // trn2 v28.4s, v10.4s, v11.4s // ....................................e...................................|...................................e................................. + // trn2 v10.2d, v25.2d, v27.2d // ...........................................e............................|..........................................e.......................... + // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e......................... + // trn1 v8.2d, v25.2d, v27.2d // ..........................................e.............................|.........................................e........................... 
+ // trn1 v9.2d, v26.2d, v28.2d // .........................................e..............................|........................................e............................ + // ldr q0, [x3], #16 // ...........................................................e............|..........................................................e.......... + // sub v24.8h, v8.8h, v9.8h // ................................................e.......................|...............................................e..................... + // add v8.8h, v8.8h, v9.8h // ..............................................e.........................|.............................................e....................... + // mul v9.8h, v24.8h, v0.h[2] // ...*....................................................................|..*.................................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................................................e|..................................................................... + // mls v9.8h, v24.8h, v7.h[0] // .........*..............................................................|........*............................................................ + // sub v24.8h, v10.8h, v11.8h // ...............................................................e........|..............................................................e...... + // add v10.8h, v10.8h, v11.8h // .................................................e......................|................................................e.................... + // mul v11.8h, v24.8h, v0.h[4] // .................................................................e......|................................................................e.... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................................................................e...|...................................................................e. 
+ // mls v11.8h, v24.8h, v7.h[0] // ....*...................................................................|...*................................................................. + // sqdmulh v25.8h, v8.8h, v7.h[1] // ..................................................e.....................|.................................................e................... + // srshr v25.8h, v25.8h, #11 // ........................................................e...............|.......................................................e............. + // mls v8.8h, v25.8h, v7.h[0] // .............................................................e..........|............................................................e........ + // sqdmulh v25.8h, v9.8h, v7.h[1] // .................................*......................................|................................*.................................... + // srshr v25.8h, v25.8h, #11 // .......................................*................................|......................................*.............................. + // mls v9.8h, v25.8h, v7.h[0] // .............................................*..........................|............................................*........................ + // sqdmulh v25.8h, v10.8h, v7.h[1] // .....................................................e..................|....................................................e................ + // srshr v25.8h, v25.8h, #11 // ............................................................e...........|...........................................................e......... + // mls v10.8h, v25.8h, v7.h[0] // ..............................................................e.........|.............................................................e....... + // sqdmulh v25.8h, v11.8h, v7.h[1] // ......................*.................................................|.....................*............................................... 
+ // srshr v25.8h, v25.8h, #11 // ..................................*.....................................|.................................*................................... + // mls v11.8h, v25.8h, v7.h[0] // .....................................*..................................|....................................*................................ + // sub v24.8h, v8.8h, v10.8h // ...................................................................e....|..................................................................e.. + // add v8.8h, v8.8h, v10.8h // ..................................................................e.....|.................................................................e... + // mul v10.8h, v24.8h, v0.h[0] // ...................................*....................................|..................................*.................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*...............................|.......................................*............................. + // mls v10.8h, v24.8h, v7.h[0] // ...............................................*........................|..............................................*...................... + // sub v24.8h, v9.8h, v11.8h // ...................................................*....................|..................................................*.................. + // add v9.8h, v9.8h, v11.8h // ....................................................*...................|...................................................*................. + // mul v11.8h, v24.8h, v0.h[0] // ..........................................................*.............|.........................................................*........... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................................*................|......................................................*.............. 
+ // mls v11.8h, v24.8h, v7.h[0] // ................................................................*.......|...............................................................*..... + // str q8, [x1], #(64) // ......................................................................e.|..................................................................... + // str q9, [x1, #(-64 + 16*1)] // .........................................................*..............|........................................................*............ + // str q10, [x1, #(-64 + 16*2)] // ......................................................*.................|.....................................................*............... + // str q11, [x1, #(-64 + 16*3)] // .....................................................................*..|....................................................................* + + sub count, count, #1 + cbnz count, layer4567_start + // Postamble of the layer4567 loop: the 20 instructions from here to 'str q5' follow the cbnz, so they run once when the loop falls through. + // Their opcode mix equals that of the 20 entries marked '*' (not 'e') in the listing above; presumably they finish the last iteration - TODO confirm. + // The diagram after each instruction gives its index in the 'original source code' listing further down; keep order and registers unchanged. + mul v19.8H, v30.8H, v13.H[2] // *................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v12.8H, v11.8H, v7.H[0] // .*.................. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v19.8H, v23.8H, v7.H[0] // ..*................. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + sqdmulh v4.8H, v12.8H, v7.H[1] // ...*................ + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + sqdmulh v28.8H, v19.8H, v7.H[1] // ....*...............
+ // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + sqrdmulh v8.8H, v26.8H, v13.H[1] // .........*.......... + // gap // .................... + // gap // .................... + srshr v21.8H, v4.8H, #11 // .....*.............. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + srshr v22.8H, v28.8H, #11 // ........*........... + // gap // .................... + // gap // .................... + mls v12.8H, v21.8H, v7.H[0] // .......*............ + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v19.8H, v22.8H, v7.H[0] // ..........*......... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mul v23.8H, v26.8H, v13.H[0] // ......*............. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v23.8H, v8.8H, v7.H[0] // ...........*........ + // gap // .................... + // gap // .................... + sub v24.8H, v19.8H, v12.8H // ............*....... + // gap // .................... + // gap // .................... + add v25.8H, v19.8H, v12.8H // .............*...... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + sqrdmulh v26.8H, v24.8H, v13.H[1] // ...............*.... + // gap // .................... + // gap // .................... + str q25, [x1, #-48] // ................*... + // gap // .................... + // gap // ....................
+ mul v5.8H, v24.8H, v13.H[0] // .................*.. + str q23, [x1, #-32] // ..............*..... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + mls v5.8H, v26.8H, v7.H[0] // ..................*. + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + // gap // .................... + str q5, [x1, #-16] // ...................* + // gap // .................... + // gap // .................... + + // original source code + // (The 20 postamble instructions in their pre-scheduling order and register names; each diagram marks that entry's slot, counted from 0, in the scheduled sequence above.) + // mul v22.8H, v30.8H, v13.H[2] // *................... + // mls v12.8H, v11.8H, v7.H[0] // .*.................. + // mls v22.8H, v23.8H, v7.H[0] // ..*................. + // sqdmulh v28.8H, v12.8H, v7.H[1] // ...*................ + // sqdmulh v3.8H, v22.8H, v7.H[1] // ....*............... + // srshr v27.8H, v28.8H, #11 // ......*............. + // mul v14.8H, v26.8H, v13.H[0] // ..........*......... + // mls v12.8H, v27.8H, v7.H[0] // ........*........... + // srshr v3.8H, v3.8H, #11 // .......*............ + // sqrdmulh v2.8H, v26.8H, v13.H[1] // .....*.............. + // mls v22.8H, v3.8H, v7.H[0] // .........*.......... + // mls v14.8H, v2.8H, v7.H[0] // ...........*........ + // sub v28.8H, v22.8H, v12.8H // ............*....... + // add v11.8H, v22.8H, v12.8H // .............*...... + // str q14, [x1, #-32] // .................*.. + // sqrdmulh v0.8H, v28.8H, v13.H[1] // ..............*..... + // str q11, [x1, #-48] // ...............*....
+ // mul v10.8H, v28.8H, v13.H[0] // ................*... + // mls v10.8H, v0.8H, v7.H[0] // ..................*. + // str q10, [x1, #-16] // ...................* + + + // --------------------------------------------------------------------- + + // Set-up for the layer123 loop: ninv and ninv_tw become aliases for v29 and v30, and each ld1r replicates one halfword read from [xtmp] into all eight .8h lanes. + // ASM_LOAD is a macro defined outside this excerpt; presumably it places the named symbol's address in its first argument - TODO confirm. + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + // count starts at 4 and is decremented once more just before layer123_start; load_roots_123 is a macro defined outside this excerpt. + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + // Preamble ahead of layer123_start: 10 instructions whose loads sit 16 bytes below the 'e'-marked loads visible in the loop body (#448 vs #464, #384 vs #400, #128 vs #144, #192 vs #208). + // NOTE(review): presumably this is the early part of the first iteration issued before the loop - confirm. Each diagram gives the instruction's index in the 'original source code' listing below. + ldr q19, [x0, #448] // *......... + ldr q23, [x0, #384] // .*........ + // gap // .......... + ldr q2, [x0, #128] // ..*....... + ldr q21, [x0, #192] // .....*.... + // gap // .......... + ldr q24, [x0, #320] // .........* + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + sub v27.8H, v23.8H, v19.8H // ...*...... + // gap // .......... + // gap // .......... + add v15.8H, v23.8H, v19.8H // ....*..... + // gap // .......... + // gap // .......... + sub v10.8H, v2.8H, v21.8H // ......*... + // gap // .......... + // gap // .......... + sqrdmulh v13.8H, v27.8H, v1.H[5] // .......*.. + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + // gap // .......... + mul v27.8H, v27.8H, v1.H[4] // ........*. + // gap // .......... + // gap // .......... + + // original source code + // ldr q26, [x0, #448] // *......... + // ldr q14, [x0, #384] // .*........ + // ldr q2, [x0, #128] // ..*....... + // sub v18.8H, v14.8H, v26.8H // .....*.... + // add v15.8H, v14.8H, v26.8H // ......*... + // ldr q21, [x0, #192] // ...*...... + // sub v10.8H, v2.8H, v21.8H // .......*.. + // sqrdmulh v13.8H, v18.8H, v1.H[5] // ........*. + // mul v27.8H, v18.8H, v1.H[4] // .........* + // ldr q24, [x0, #320] // ....*..... + + // The loop is entered with count reduced by one (4 to 3). NOTE(review): presumably because the preamble has already begun one iteration - confirm against the code that follows the loop. + sub count, count, #1 +layer123_start: + ldr q28, [x0, #64] // .*......................................................................................
+ sqrdmulh v9.8H, v10.8H, v1.H[1] // ................*....................................................................... + ldr q23, [x0, #0] // *....................................................................................... + ldr q26, [x0, #464] // .......e................................................................................ + ldr q14, [x0, #400] // ......e................................................................................. + // gap // ........................................................................................ + mul v3.8H, v10.8H, v1.H[0] // ...............*........................................................................ + add v8.8H, v2.8H, v21.8H // ..............*......................................................................... + ldr q22, [x0, #256] // ....*................................................................................... + ldr q2, [x0, #144] // ..e..................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v25.8H, v23.8H, v28.8H // .........*.............................................................................. + mls v27.8H, v13.8H, v7.H[0] // ...........................*............................................................ + // gap // ........................................................................................ + sub v20.8H, v23.8H, v28.8H // ........*............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v3.8H, v9.8H, v7.H[0] // .................*...................................................................... + add v23.8H, v22.8H, v24.8H // ...................*.................................................................... + // gap // ........................................................................................ + sub v28.8H, v22.8H, v24.8H // ..................*..................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v17.8H, v20.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v19.8H, v23.8H, v15.8H // ......................................*................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v12.8H, v25.8H, v8.8H // ............................*........................................................... + mul v21.8H, v20.8H, v0.H[6] // ..........*............................................................................. + // gap // ........................................................................................ + sub v18.8H, v14.8H, v26.8H // .......................e................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ add v31.8H, v23.8H, v15.8H // .......................................*................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v28.8H, v1.H[3] // .....................*.................................................................. + add v15.8H, v14.8H, v26.8H // ........................e............................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v21.8H, v17.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v14.8H, v28.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v14.8H, v23.8H, v7.H[0] // ......................*................................................................. + add v23.8H, v25.8H, v8.8H // .............................*.......................................................... + // gap // ........................................................................................ + sub v28.8H, v21.8H, v3.8H // .................................*...................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v3.8H, v21.8H, v3.8H // ..................................*..................................................... + mul v21.8H, v19.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ + add v24.8H, v23.8H, v31.8H // .................................................*...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v13.8H, v19.8H, v0.H[5] // .........................................*.............................................. + sub v23.8H, v23.8H, v31.8H // ................................................*....................................... + // gap // ........................................................................................ + add v22.8H, v14.8H, v27.8H // ............................................*........................................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v6.8H, v12.8H, v0.H[2] // ..............................*......................................................... + sub v17.8H, v14.8H, v27.8H // ...........................................*............................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v20.8H, v23.8H, v0.H[0] // ..................................................*..................................... + sub v10.8H, v3.8H, v22.8H // .....................................................*.................................. + // gap // ........................................................................................ + add v9.8H, v3.8H, v22.8H // ......................................................*................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v14.8H, v17.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sqrdmulh v19.8H, v23.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v26.8H, v24.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v31.8H, v28.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v20.8H, v19.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v4.8H, v28.8H, v0.H[2] // ...................................*.................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v25.8H, v12.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q20, [x0, #256] // ....................................................................*................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v21.8H, v13.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v22.8H, v17.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v6.8H, v25.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sqrdmulh v19.8H, v24.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v11.8H, v10.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v8.8H, v6.8H, v21.8H // ..........................................................*............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + add v25.8H, v6.8H, v21.8H // ...........................................................*............................ + ldr q21, [x0, #208] // ...e.................................................................................... + mul v24.8H, v9.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v3.8H, v8.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v20.8H, v8.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v4.8H, v31.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v22.8H, v14.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v26.8H, v19.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v10.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v23.8H, v4.8H, v22.8H // ...............................................................*........................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sub v10.8H, v2.8H, v21.8H // .............e.......................................................................... + mls v20.8H, v3.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + add v14.8H, v4.8H, v22.8H // ................................................................*....................... + str q26, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v28.8H, v25.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v11.8H, v19.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q20, [x0, #368] // ......................................................................*................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sqrdmulh v12.8H, v25.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v22.8H, v14.8H, v29.8H // .................................................................................*...... + str q11, [x0, #304] // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v19.8H, v23.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sqrdmulh v6.8H, v23.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v11.8H, v9.8H, v30.8H // ............................................................................*........... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v14.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v19.8H, v6.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v24.8H, v11.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v28.8H, v12.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q19, [x0, #432] // .......................................................................*................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v22.8H, v8.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q24, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v13.8H, v18.8H, v1.H[5] // ..........................e............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q28, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v27.8H, v18.8H, v1.H[4] // .........................e.............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + ldr q24, [x0, #320] // .....e.................................................................................. 
+ + // original source code + // ldr q8, [x0, #0] // .....................................................................................|.*.................................................................................... + // ldr q9, [x0, #(1*(512/8))] // .....................................................................................*...................................................................................... + // ldr q10, [x0, #(2*(512/8))] // .....e...............................................................................|.......e.............................................................................. + // ldr q11, [x0, #(3*(512/8))] // ...................................................e.................................|.....................................................e................................ + // ldr q12, [x0, #(4*(512/8))] // ....*................................................................................|......*............................................................................... + // ldr q13, [x0, #(5*(512/8))] // ....................................................................................e|...................................................................................... + // ldr q14, [x0, #(6*(512/8))] // .e...................................................................................|...e.................................................................................. + // ldr q15, [x0, #(7*(512/8))] // e....................................................................................|..e................................................................................... + // sub v24.8h, v8.8h, v9.8h // ........*............................................................................|..........*........................................................................... 
+ // add v8.8h, v8.8h, v9.8h // ......*..............................................................................|........*............................................................................. + // mul v9.8h, v24.8h, v0.h[6] // ...............*.....................................................................|.................*.................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ............*........................................................................|..............*....................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ....................*................................................................|......................*............................................................... + // sub v24.8h, v10.8h, v11.8h // ............................................................e........................|..............................................................e....................... + // add v10.8h, v10.8h, v11.8h // ...*.................................................................................|.....*................................................................................ + // mul v11.8h, v24.8h, v1.h[0] // ..*..................................................................................|....*................................................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // .....................................................................................|*..................................................................................... + // mls v11.8h, v24.8h, v7.h[0] // .........*...........................................................................|...........*.......................................................................... 
+ // sub v24.8h, v12.8h, v13.8h // ...........*.........................................................................|.............*........................................................................ + // add v12.8h, v12.8h, v13.8h // ..........*..........................................................................|............*......................................................................... + // mul v13.8h, v24.8h, v1.h[2] // .....................*...............................................................|.......................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..................*..................................................................|....................*................................................................. + // mls v13.8h, v24.8h, v7.h[0] // ......................*..............................................................|........................*............................................................. + // sub v24.8h, v14.8h, v15.8h // ................e....................................................................|..................e................................................................... + // add v14.8h, v14.8h, v15.8h // ...................e.................................................................|.....................e................................................................ + // mul v15.8h, v24.8h, v1.h[4] // ..................................................................................e..|....................................................................................e. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ................................................................................e....|..................................................................................e... 
+ // mls v15.8h, v24.8h, v7.h[0] // .......*.............................................................................|.........*............................................................................ + // sub v24.8h, v8.8h, v10.8h // ..............*......................................................................|................*..................................................................... + // add v8.8h, v8.8h, v10.8h // .......................*.............................................................|.........................*............................................................ + // mul v10.8h, v24.8h, v0.h[2] // ...............................*.....................................................|.................................*.................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................*..........................................|............................................*......................................... + // mls v10.8h, v24.8h, v7.h[0] // ..............................................*......................................|................................................*..................................... + // sub v24.8h, v9.8h, v11.8h // ........................*............................................................|..........................*........................................................... + // add v9.8h, v9.8h, v11.8h // .........................*...........................................................|...........................*.......................................................... + // mul v11.8h, v24.8h, v0.h[2] // .........................................*...........................................|...........................................*.......................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // .......................................*.............................................|.........................................*............................................ + // mls v11.8h, v24.8h, v7.h[0] // .......................................................*.............................|.........................................................*............................ + // sub v24.8h, v12.8h, v14.8h // .............*.......................................................................|...............*...................................................................... + // add v12.8h, v12.8h, v14.8h // .................*...................................................................|...................*.................................................................. + // mul v14.8h, v24.8h, v0.h[4] // ..........................*..........................................................|............................*......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............................*........................................................|..............................*....................................................... + // mls v14.8h, v24.8h, v7.h[0] // ............................................*........................................|..............................................*....................................... + // sub v24.8h, v13.8h, v15.8h // ................................*....................................................|..................................*................................................... + // add v13.8h, v13.8h, v15.8h // ..............................*......................................................|................................*..................................................... 
+ // mul v15.8h, v24.8h, v0.h[4] // .............................................*.......................................|...............................................*...................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ....................................*................................................|......................................*............................................... + // mls v15.8h, v24.8h, v7.h[0] // ........................................................*............................|..........................................................*........................... + // sub v24.8h, v8.8h, v12.8h // .............................*.......................................................|...............................*...................................................... + // add v8.8h, v8.8h, v12.8h // ...........................*.........................................................|.............................*........................................................ + // mul v12.8h, v24.8h, v0.h[0] // .................................*...................................................|...................................*.................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................*...............................................|.......................................*.............................................. + // mls v12.8h, v24.8h, v7.h[0] // ........................................*............................................|..........................................*........................................... + // sub v24.8h, v9.8h, v13.8h // ..................................*..................................................|....................................*................................................. 
+ // add v9.8h, v9.8h, v13.8h // ...................................*.................................................|.....................................*................................................ + // mul v13.8h, v24.8h, v0.h[0] // ................................................*....................................|..................................................*................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................................................*..........................|............................................................*......................... + // mls v13.8h, v24.8h, v7.h[0] // .................................................................*...................|...................................................................*.................. + // sub v24.8h, v10.8h, v14.8h // .................................................*...................................|...................................................*.................................. + // add v10.8h, v10.8h, v14.8h // ..................................................*..................................|....................................................*................................. + // mul v14.8h, v24.8h, v0.h[0] // ......................................................*..............................|........................................................*............................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................................*...............................|.......................................................*.............................. + // mls v14.8h, v24.8h, v7.h[0] // .............................................................*.......................|...............................................................*...................... 
+ // sub v24.8h, v11.8h, v15.8h // ...........................................................*.........................|.............................................................*........................ + // add v11.8h, v11.8h, v15.8h // ..............................................................*......................|................................................................*..................... + // mul v15.8h, v24.8h, v0.h[0] // ......................................................................*..............|........................................................................*............. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................................................................*.............|.........................................................................*............ + // mls v15.8h, v24.8h, v7.h[0] // ..........................................................................*..........|............................................................................*......... + // str q12, [x0, #(4*(512/8))] // ...........................................*.........................................|.............................................*........................................ + // str q13, [x0, #(5*(512/8))] // .....................................................................*...............|.......................................................................*.............. + // str q14, [x0, #(6*(512/8))] // ..................................................................*..................|....................................................................*................. + // str q15, [x0, #(7*(512/8))] // .............................................................................*.......|...............................................................................*...... 
+ // mul v12.8h, v8.8h, v29.8h // ......................................*..............................................|........................................*............................................. + // sqrdmulh v8.8h, v8.8h, v30.8h // ...............................................*.....................................|.................................................*.................................... + // mls v12.8h, v8.8h, v7.h[0] // .........................................................*...........................|...........................................................*.......................... + // mul v13.8h, v9.8h, v29.8h // ....................................................*................................|......................................................*............................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ........................................................................*............|..........................................................................*........... + // mls v13.8h, v9.8h, v7.h[0] // ...........................................................................*.........|.............................................................................*........ + // mul v14.8h, v10.8h, v29.8h // ................................................................*....................|..................................................................*................... + // sqrdmulh v10.8h, v10.8h, v30.8h // ...................................................................*.................|.....................................................................*................ + // mls v14.8h, v10.8h, v7.h[0] // ............................................................................*........|..............................................................................*....... 
+ // mul v15.8h, v11.8h, v29.8h // ....................................................................*................|......................................................................*............... + // sqrdmulh v11.8h, v11.8h, v30.8h // .........................................................................*...........|...........................................................................*.......... + // mls v15.8h, v11.8h, v7.h[0] // ..............................................................................*......|................................................................................*..... + // str q12, [x0], #(16) // ...............................................................*.....................|.................................................................*.................... + // str q13, [x0, #(-16 + 1*(512/8))] // ...............................................................................*.....|.................................................................................*.... + // str q14, [x0, #(-16 + 2*(512/8))] // .................................................................................*...|...................................................................................*.. + // str q15, [x0, #(-16 + 3*(512/8))] // ...................................................................................*.|.....................................................................................* + + sub count, count, #1 + cbnz count, layer123_start + add v19.8H, v2.8H, v21.8H // ....*......................................................................... + ldr q23, [x0, #256] // .....*........................................................................ + sqrdmulh v22.8H, v10.8H, v1.H[1] // .*............................................................................ + ldr q28, [x0, #64] // *............................................................................. 
+ ldr q3, [x0, #0] // ..*........................................................................... + // gap // .............................................................................. + mul v26.8H, v10.8H, v1.H[0] // ...*.......................................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v27.8H, v13.8H, v7.H[0] // .......*...................................................................... + add v20.8H, v23.8H, v24.8H // ..........*................................................................... + // gap // .............................................................................. + sub v23.8H, v23.8H, v24.8H // ...........*.................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v26.8H, v22.8H, v7.H[0] // .........*.................................................................... + add v22.8H, v3.8H, v28.8H // ......*....................................................................... + // gap // .............................................................................. + sub v24.8H, v20.8H, v15.8H // .............*................................................................ + // gap // .............................................................................. + // gap // .............................................................................. 
+ add v20.8H, v20.8H, v15.8H // ................*............................................................. + sqrdmulh v11.8H, v23.8H, v1.H[3] // .................*............................................................ + // gap // .............................................................................. + sub v14.8H, v22.8H, v19.8H // ..............*............................................................... + // gap // .............................................................................. + // gap // .............................................................................. + add v19.8H, v22.8H, v19.8H // .....................*........................................................ + mul v23.8H, v23.8H, v1.H[2] // ...................*.......................................................... + // gap // .............................................................................. + sub v22.8H, v3.8H, v28.8H // ........*..................................................................... + // gap // .............................................................................. + // gap // .............................................................................. + mul v28.8H, v24.8H, v0.H[4] // ........................*..................................................... + // gap // .............................................................................. + // gap // .............................................................................. + add v3.8H, v19.8H, v20.8H // .........................*.................................................... + // gap // .............................................................................. + // gap // .............................................................................. + sub v19.8H, v19.8H, v20.8H // ...........................*.................................................. 
+ mls v23.8H, v11.8H, v7.H[0] // ....................*......................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v20.8H, v22.8H, v0.H[7] // ............*................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v22.8H, v22.8H, v0.H[6] // ...............*.............................................................. + // gap // .............................................................................. + // gap // .............................................................................. + add v11.8H, v23.8H, v27.8H // ............................*................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v23.8H, v23.8H, v27.8H // ..............................*............................................... + sqrdmulh v27.8H, v24.8H, v0.H[5] // ..........................*................................................... + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v22.8H, v20.8H, v7.H[0] // ..................*........................................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v24.8H, v14.8H, v0.H[2] // .............................*................................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v20.8H, v19.8H, v0.H[0] // ...............................*.............................................. + // gap // .............................................................................. + // gap // .............................................................................. + sub v25.8H, v22.8H, v26.8H // ......................*....................................................... + // gap // .............................................................................. + // gap // .............................................................................. 
+ add v22.8H, v22.8H, v26.8H // .......................*...................................................... + sqrdmulh v26.8H, v14.8H, v0.H[3] // ........................................*..................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v14.8H, v23.8H, v0.H[5] // ..................................*........................................... + // gap // .............................................................................. + // gap // .............................................................................. + sub v5.8H, v22.8H, v11.8H // ................................*............................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v19.8H, v19.8H, v0.H[1] // ...................................*.......................................... + add v22.8H, v22.8H, v11.8H // .................................*............................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v11.8H, v3.8H, v29.8H // ....................................*......................................... + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v10.8H, v25.8H, v0.H[3] // .....................................*........................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v28.8H, v27.8H, v7.H[0] // ..........................................*................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v27.8H, v3.8H, v30.8H // .............................................*................................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + mls v20.8H, v19.8H, v7.H[0] // ......................................*....................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v19.8H, v25.8H, v0.H[2] // .......................................*...................................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v23.8H, v23.8H, v0.H[4] // ...........................................*.................................. + // gap // .............................................................................. + // gap // .............................................................................. + str q20, [x0, #256] // .........................................*.................................... + // gap // .............................................................................. + // gap // .............................................................................. + mls v24.8H, v26.8H, v7.H[0] // ............................................*................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v3.8H, v5.8H, v0.H[0] // ..............................................*............................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v26.8H, v22.8H, v29.8H // .................................................*............................ + // gap // .............................................................................. + // gap // .............................................................................. + sub v20.8H, v24.8H, v28.8H // ...............................................*.............................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v19.8H, v10.8H, v7.H[0] // ....................................................*......................... + add v28.8H, v24.8H, v28.8H // ................................................*............................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v24.8H, v20.8H, v0.H[1] // ..................................................*........................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v20.8H, v20.8H, v0.H[0] // ...................................................*.......................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v23.8H, v14.8H, v7.H[0] // .....................................................*........................ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ mls v11.8H, v27.8H, v7.H[0] // ......................................................*....................... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v27.8H, v5.8H, v0.H[1] // .......................................................*...................... + // gap // .............................................................................. + // gap // .............................................................................. + sub v14.8H, v19.8H, v23.8H // ........................................................*..................... + // gap // .............................................................................. + // gap // .............................................................................. + add v19.8H, v19.8H, v23.8H // ..........................................................*................... + mls v20.8H, v24.8H, v7.H[0] // .........................................................*.................... + // gap // .............................................................................. + str q11, [x0], #(16) // ...........................................................*.................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v23.8H, v28.8H, v29.8H // ............................................................*................. + // gap // .............................................................................. 
+ // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v3.8H, v27.8H, v7.H[0] // .............................................................*................ + // gap // .............................................................................. + // gap // .............................................................................. + str q20, [x0, #368] // ..............................................................*............... + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v28.8H, v28.8H, v30.8H // ...............................................................*.............. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mul v27.8H, v19.8H, v29.8H // ................................................................*............. + // gap // .............................................................................. + // gap // .............................................................................. + str q3, [x0, #304] // .................................................................*............ 
+ // gap // .............................................................................. + // gap // .............................................................................. + mul v24.8H, v14.8H, v0.H[0] // ..................................................................*........... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v3.8H, v14.8H, v0.H[1] // ...................................................................*.......... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + sqrdmulh v22.8H, v22.8H, v30.8H // ....................................................................*......... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. 
+ sqrdmulh v19.8H, v19.8H, v30.8H // .....................................................................*........ + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v24.8H, v3.8H, v7.H[0] // ......................................................................*....... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v26.8H, v22.8H, v7.H[0] // .......................................................................*...... + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + mls v23.8H, v28.8H, v7.H[0] // ........................................................................*..... + // gap // .............................................................................. + // gap // .............................................................................. 
+ str q24, [x0, #432] // .........................................................................*.... + // gap // .............................................................................. + // gap // .............................................................................. + mls v27.8H, v19.8H, v7.H[0] // ..........................................................................*... + // gap // .............................................................................. + // gap // .............................................................................. + str q26, [x0, #48] // ...........................................................................*.. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + str q23, [x0, #112] // ............................................................................*. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + // gap // .............................................................................. + str q27, [x0, #176] // .............................................................................* + // gap // .............................................................................. + // gap // .............................................................................. 
+ + // original source code + // ldr q28, [x0, #64] // ...*.......................................................................... + // sqrdmulh v9.8H, v10.8H, v1.H[1] // ..*........................................................................... + // ldr q23, [x0, #0] // ....*......................................................................... + // mul v3.8H, v10.8H, v1.H[0] // .....*........................................................................ + // add v8.8H, v2.8H, v21.8H // *............................................................................. + // ldr q22, [x0, #256] // .*............................................................................ + // add v25.8H, v23.8H, v28.8H // ..........*................................................................... + // mls v27.8H, v13.8H, v7.H[0] // ......*....................................................................... + // sub v20.8H, v23.8H, v28.8H // .................*............................................................ + // mls v3.8H, v9.8H, v7.H[0] // .........*.................................................................... + // add v23.8H, v22.8H, v24.8H // .......*...................................................................... + // sub v28.8H, v22.8H, v24.8H // ........*..................................................................... + // sqrdmulh v17.8H, v20.8H, v0.H[7] // ......................*....................................................... + // sub v19.8H, v23.8H, v15.8H // ...........*.................................................................. + // sub v12.8H, v25.8H, v8.8H // ..............*............................................................... + // mul v21.8H, v20.8H, v0.H[6] // .......................*...................................................... + // add v31.8H, v23.8H, v15.8H // ............*................................................................. 
+ // sqrdmulh v23.8H, v28.8H, v1.H[3] // .............*................................................................ + // mls v21.8H, v17.8H, v7.H[0] // ...........................*.................................................. + // mul v14.8H, v28.8H, v1.H[2] // ................*............................................................. + // mls v14.8H, v23.8H, v7.H[0] // .....................*........................................................ + // add v23.8H, v25.8H, v8.8H // ...............*.............................................................. + // sub v28.8H, v21.8H, v3.8H // ..............................*............................................... + // add v3.8H, v21.8H, v3.8H // ...............................*.............................................. + // mul v21.8H, v19.8H, v0.H[4] // ..................*........................................................... + // add v24.8H, v23.8H, v31.8H // ...................*.......................................................... + // sqrdmulh v13.8H, v19.8H, v0.H[5] // ..........................*................................................... + // sub v23.8H, v23.8H, v31.8H // ....................*......................................................... + // add v22.8H, v14.8H, v27.8H // ........................*..................................................... + // mul v6.8H, v12.8H, v0.H[2] // ............................*................................................. + // sub v17.8H, v14.8H, v27.8H // .........................*.................................................... + // mul v20.8H, v23.8H, v0.H[0] // .............................*................................................ + // sub v10.8H, v3.8H, v22.8H // ..................................*........................................... + // add v9.8H, v3.8H, v22.8H // ....................................*......................................... 
+ // sqrdmulh v14.8H, v17.8H, v0.H[5] // .................................*............................................ + // sqrdmulh v19.8H, v23.8H, v0.H[1] // ...................................*.......................................... + // mul v26.8H, v24.8H, v29.8H // .....................................*........................................ + // sqrdmulh v31.8H, v28.8H, v0.H[3] // ......................................*....................................... + // mls v20.8H, v19.8H, v7.H[0] // .........................................*.................................... + // mul v4.8H, v28.8H, v0.H[2] // ..........................................*................................... + // sqrdmulh v25.8H, v12.8H, v0.H[3] // ................................*............................................. + // str q20, [x0, #256] // ............................................*................................. + // mls v21.8H, v13.8H, v7.H[0] // .......................................*...................................... + // mul v22.8H, v17.8H, v0.H[4] // ...........................................*.................................. + // mls v6.8H, v25.8H, v7.H[0] // .............................................*................................ + // sqrdmulh v19.8H, v24.8H, v30.8H // ........................................*..................................... + // mul v11.8H, v10.8H, v0.H[0] // ..............................................*............................... + // sub v8.8H, v6.8H, v21.8H // ................................................*............................. + // add v25.8H, v6.8H, v21.8H // ..................................................*........................... + // mul v24.8H, v9.8H, v29.8H // ...............................................*.............................. + // sqrdmulh v3.8H, v8.8H, v0.H[1] // ...................................................*.......................... 
+ // mul v20.8H, v8.8H, v0.H[0] // ....................................................*......................... + // mls v4.8H, v31.8H, v7.H[0] // .................................................*............................ + // mls v22.8H, v14.8H, v7.H[0] // .....................................................*........................ + // mls v26.8H, v19.8H, v7.H[0] // ......................................................*....................... + // sqrdmulh v19.8H, v10.8H, v0.H[1] // .......................................................*...................... + // sub v23.8H, v4.8H, v22.8H // ........................................................*..................... + // mls v20.8H, v3.8H, v7.H[0] // ..........................................................*................... + // add v14.8H, v4.8H, v22.8H // .........................................................*.................... + // str q26, [x0], #(16) // ...........................................................*.................. + // mul v28.8H, v25.8H, v29.8H // ............................................................*................. + // mls v11.8H, v19.8H, v7.H[0] // .............................................................*................ + // str q20, [x0, #368] // ..............................................................*............... + // sqrdmulh v12.8H, v25.8H, v30.8H // ...............................................................*.............. + // mul v22.8H, v14.8H, v29.8H // ................................................................*............. + // str q11, [x0, #304] // .................................................................*............ + // mul v19.8H, v23.8H, v0.H[0] // ..................................................................*........... + // sqrdmulh v6.8H, v23.8H, v0.H[1] // ...................................................................*.......... 
+ // sqrdmulh v11.8H, v9.8H, v30.8H // ....................................................................*......... + // sqrdmulh v8.8H, v14.8H, v30.8H // .....................................................................*........ + // mls v19.8H, v6.8H, v7.H[0] // ......................................................................*....... + // mls v24.8H, v11.8H, v7.H[0] // .......................................................................*...... + // mls v28.8H, v12.8H, v7.H[0] // ........................................................................*..... + // str q19, [x0, #432] // .........................................................................*.... + // mls v22.8H, v8.8H, v7.H[0] // ..........................................................................*... + // str q24, [x0, #48] // ...........................................................................*.. + // str q28, [x0, #112] // ............................................................................*. 
+ // str q22, [x0, #176] // .............................................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s new file mode 100644 index 00000000..ea0a6c4f --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_firestorm.s @@ -0,0 +1,1970 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. 
+///
+
+// Needed to provide ASM_LOAD directive
+#include <hal_env.h>
+
+// NOTE
+// We use a lot of trivial macros to simplify the parsing burden for Slothy
+// The macros are not unfolded by Slothy and thus interpreted as instructions,
+// which are easier to parse due to e.g. the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+.macro ldr_vo vec, base, offset // load the q-form alias of vec from [base + offset]; base unchanged
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+.macro ldr_vi vec, base, inc // load the q-form alias of vec from [base], then base += inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+.macro str_vo vec, base, offset // store the q-form alias of vec to [base + offset]; base unchanged
+        str qform_\vec, [\base, #\offset]
+.endm
+.macro str_vi vec, base, inc // store the q-form alias of vec to [base], then base += inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+.macro vqrdmulh d,a,b // d = sqrdmulh(a, b), vector by vector on 8 halfword lanes
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+.macro vmlsq d,a,b,i // d -= a * b.h[i] (mls, by-element form)
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, b.h[i]) (by-element form)
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vqdmulhq d,a,b,i // d = sqdmulh(a, b.h[i]) (by-element form, no rounding)
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+.macro vmulq d,a,b,i // d = a * b.h[i] (low halves, by-element form)
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+.macro mulmodq dst, src, const, idx0, idx1 // multiply src by a constant held in lanes idx0/idx1 of const; overwrites src
+        vmulq \dst, \src, \const, \idx0 // low product with lane idx0
+        vqrdmulhq \src, \src, \const, \idx1 // rounded doubling high product with lane idx1
+        vmlsq \dst, \src, consts, 0 // subtract high part times lane 0 of consts
+.endm
+
+.macro mulmod dst, src, const, const_twisted // same three steps as mulmodq, with full-vector constants; overwrites src
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro gs_butterfly a, b, root, idx0, idx1 // a = a + b; b = mulmodq(a - b, root lanes idx0/idx1); clobbers tmp
+        sub tmp.8h, \a\().8h, \b\().8h // difference is taken before a is overwritten
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+.macro mulmod_v dst, src, const, const_twisted // body is identical to mulmod
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+.macro gs_butterfly_v a, b, root, root_twisted // a = a + b; b = mulmod(a - b, root, root_twisted); clobbers tmp
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted // calls mulmod, not mulmod_v; the two bodies are the same
+.endm
+
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 // four mulmod calls by ninv/ninv_tw; overwrites src0-src3
+        mulmod \dst0, \src0, ninv, ninv_tw // NOTE(review): no .req for ninv/ninv_tw is visible near this macro - confirm before invoking
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+.macro barrett_reduce a // a -= srshr(sqdmulh(a, consts.h[1]), 11) * consts.h[0]; clobbers t0
+        vqdmulhq t0, \a, consts, 1
+        srshr t0.8h, t0.8h, #11
+        vmlsq \a, t0, consts, 0
+.endm
+
+.macro load_roots_123 // loads two consecutive 16-byte vectors into root0 and root1
+        ldr_vi root0, r_ptr0, 32 // r_ptr0 advances by 32 after the load
+        ldr_vo root1, r_ptr0, -16 // so -16 addresses the vector directly after root0
+.endm
+
+.macro load_next_roots_45 // loads one 16-byte vector into root0 and advances r_ptr0 by 16
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+.macro load_next_roots_67 // loads six consecutive 16-byte vectors starting at r_ptr1
+        ldr_vi root0, r_ptr1, (6*16) // r_ptr1 advances by 96 here, hence the negative offsets below
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+.macro transpose4 data // in-place trn1/trn2 shuffle of data0-data3 (looks like a 4x4 transpose of 32-bit lanes); clobbers t0-t3
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s // stage 1: pair up 32-bit lanes of rows 0/1 and 2/3
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d // stage 2: recombine 64-bit halves back into data0-data3
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+.macro transpose_single data_out, data_in // only the .4s stage of transpose4, written to data_out0-3
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6) // reserve 96 bytes for x19-x29
+        stp x19, x20, [sp, #16*0]
+        stp x19, x20, [sp, #16*0] // NOTE(review): exact repeat of the store above; harmless but redundant
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0] // reloads in the same slot layout save_gprs wrote
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6) // release the 96 bytes
+.endm
+
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4) // reserve 64 bytes for d8-d15 (the low 64 bits of v8-v15)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0] // reloads in the same slot layout save_vregs wrote
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4) // release the 64 bytes
+.endm
+
+#define STACK_SIZE 16
+#define STACK0 0
+
+.macro restore a, loc // slothy:no-unfold
+        ldr \a, [sp, #\loc\()] // reload a from stack slot loc
+.endm
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()] // spill a to stack slot loc (note the argument order differs from restore)
+.endm
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE // scratch area addressed by save/restore, e.g. slot STACK0
+.endm
+
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE // undo push_stack in reverse order
+        restore_vregs
+        restore_gprs
+.endm
+
+.data
+.p2align 4
+roots: // twiddle table; its contents come from the include below
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+        .global intt_kyber_123_4567_manual_ld4_opt_m1_firestorm
+        .global _intt_kyber_123_4567_manual_ld4_opt_m1_firestorm
+
+.p2align 4
+const_addr: .short 3329 // lane 0: factor of the mls step in mulmod/mulmodq/barrett_reduce (presumably the Kyber modulus)
+        .short 20159 // lane 1: sqdmulh factor in barrett_reduce
+        .short 0 // lanes 2-7 are zero
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+ninv_addr: .short 512 // eight copies of 512; presumably the ninv operand of mul_ninv - load site not visible here
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+ninv_tw_addr: .short 5040 // eight copies of 5040; presumably the ninv_tw operand of mul_ninv - load site not visible here
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+
+intt_kyber_123_4567_manual_ld4_opt_m1_firestorm:
+_intt_kyber_123_4567_manual_ld4_opt_m1_firestorm: // second label with a leading underscore for the same entry point
+        push_stack
+
+        in .req x0
+        inp .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        qform_v0 .req q0 // qform_vN names the q view of vN so ldr_vo/str_vo can paste qform_ in front of a vector name
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q13, [x3], #16 // ...................................*.............. + ldr q28, [x4, #48] // ...*.............................................. + ldr q3, [x4, #16] // ..*............................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // *................................................. + ldr q12, [x4, #80] // ....*............................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q22, [x4, #32] // .....*............................................ + ldr q26, [x4, #64] // .*................................................ + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v8.8H, v14.8H, v15.8H // .........*........................................ + add v0.8H, v14.8H, v15.8H // ..........*....................................... + sub v23.8H, v16.8H, v17.8H // .......*.......................................... + add v27.8H, v16.8H, v17.8H // ........*......................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q24, [x4], #(6*16) // ......*........................................... + sqrdmulh v19.8H, v8.8H, v28.8H // .............*.................................... + sqrdmulh v10.8H, v23.8H, v12.8H // ...........*...................................... + mul v25.8H, v23.8H, v26.8H // ............*..................................... + mul v21.8H, v8.8H, v22.8H // ..............*................................... + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + add v6.8H, v0.8H, v27.8H // ................*................................. + sub v9.8H, v0.8H, v27.8H // ...............*.................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v22.8H, v9.8H, v24.8H // .................*................................ + sqrdmulh v30.8H, v9.8H, v3.8H // ..................*............................... + mls v25.8H, v10.8H, v7.H[0] // ....................*............................. + mls v21.8H, v19.8H, v7.H[0] // ...................*.............................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v28.8H, v21.8H, v25.8H // ......................*........................... + sub v17.8H, v21.8H, v25.8H // .....................*............................ + mls v22.8H, v30.8H, v7.H[0] // .......................*.......................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn2 v23.4S, v6.4S, v28.4S // ........................*......................... 
+ trn1 v0.4S, v6.4S, v28.4S // ...........................*...................... + sqrdmulh v19.8H, v17.8H, v3.8H // ..........................*....................... + mul v29.8H, v17.8H, v24.8H // .........................*........................ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v29.8H, v19.8H, v7.H[0] // ............................*..................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v8.4S, v22.4S, v29.4S // ..............................*................... + trn2 v31.4S, v22.4S, v29.4S // .............................*.................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + trn2 v26.2D, v0.2D, v8.2D // .................................*................ + trn1 v8.2D, v0.2D, v8.2D // ..................................*............... + trn2 v5.2D, v23.2D, v31.2D // ...............................*.................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + trn1 v22.2D, v23.2D, v31.2D // ................................*................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v19.8H, v8.8H, v22.8H // .....................................*............ + add v28.8H, v26.8H, v5.8H // ....................................*............. + sub v23.8H, v8.8H, v22.8H // ........................................*......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v31.8H, v23.8H, v13.H[2] // ...........................................*...... + sqrdmulh v10.8H, v23.8H, v13.H[3] // ..............................................*... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqdmulh v22.8H, v28.8H, v7.H[1] // ......................................*........... + sqdmulh v23.8H, v19.8H, v7.H[1] // .......................................*.......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ srshr v21.8H, v23.8H, #11 // .........................................*........ + srshr v24.8H, v22.8H, #11 // ..........................................*....... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v19.8H, v21.8H, v7.H[0] // ............................................*..... + mls v28.8H, v24.8H, v7.H[0] // .............................................*.... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v0.8H, v19.8H, v28.8H // ................................................*. + add v9.8H, v19.8H, v28.8H // ...............................................*.. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + str q9, [x1], #(64) // .................................................* + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + + // original source code + // ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // ...*.............................................. + // ldr q28, [x4, #64] // ......*........................................... + // ldr q6, [x4, #16] // ..*............................................... + // ldr q24, [x4, #48] // .*................................................ + // ldr q25, [x4, #80] // ....*............................................. + // ldr q10, [x4, #32] // .....*............................................ + // ldr q2, [x4], #(6*16) // ...........*...................................... + // sub v5.8H, v16.8H, v17.8H // .........*........................................ + // add v21.8H, v16.8H, v17.8H // ..........*....................................... + // sub v27.8H, v14.8H, v15.8H // .......*.......................................... + // add v16.8H, v14.8H, v15.8H // ........*......................................... + // sqrdmulh v23.8H, v5.8H, v25.8H // .............*.................................... + // mul v11.8H, v5.8H, v28.8H // ..............*................................... + // sqrdmulh v20.8H, v27.8H, v24.8H // ............*..................................... + // mul v9.8H, v27.8H, v10.8H // ...............*.................................. 
+ // sub v25.8H, v16.8H, v21.8H // .................*................................ + // add v18.8H, v16.8H, v21.8H // ................*................................. + // mul v29.8H, v25.8H, v2.8H // ..................*............................... + // sqrdmulh v14.8H, v25.8H, v6.8H // ...................*.............................. + // mls v9.8H, v20.8H, v7.H[0] // .....................*............................ + // mls v11.8H, v23.8H, v7.H[0] // ....................*............................. + // sub v30.8H, v9.8H, v11.8H // .......................*.......................... + // add v5.8H, v9.8H, v11.8H // ......................*........................... + // mls v29.8H, v14.8H, v7.H[0] // ........................*......................... + // trn2 v21.4S, v18.4S, v5.4S // .........................*........................ + // mul v12.8H, v30.8H, v2.8H // ............................*..................... + // sqrdmulh v22.8H, v30.8H, v6.8H // ...........................*...................... + // trn1 v28.4S, v18.4S, v5.4S // ..........................*....................... + // mls v12.8H, v22.8H, v7.H[0] // .............................*.................... + // trn2 v22.4S, v29.4S, v12.4S // ...............................*.................. + // trn1 v25.4S, v29.4S, v12.4S // ..............................*................... + // trn2 v5.2D, v21.2D, v22.2D // ..................................*............... + // trn1 v24.2D, v21.2D, v22.2D // ...................................*.............. + // trn2 v26.2D, v28.2D, v25.2D // ................................*................. + // trn1 v25.2D, v28.2D, v25.2D // .................................*................ + // ldr q13, [x3], #16 // *................................................. + // add v20.8H, v26.8H, v5.8H // .....................................*............ + // add v8.8H, v25.8H, v24.8H // ....................................*............. 
+ // sqdmulh v10.8H, v20.8H, v7.H[1] // .........................................*........ + // sqdmulh v11.8H, v8.8H, v7.H[1] // ..........................................*....... + // sub v23.8H, v25.8H, v24.8H // ......................................*........... + // srshr v27.8H, v11.8H, #11 // ...........................................*...... + // srshr v10.8H, v10.8H, #11 // ............................................*..... + // mul v31.8H, v23.8H, v13.H[2] // .......................................*.......... + // mls v8.8H, v27.8H, v7.H[0] // .............................................*.... + // mls v20.8H, v10.8H, v7.H[0] // ..............................................*... + // sqrdmulh v10.8H, v23.8H, v13.H[3] // ........................................*......... + // add v28.8H, v8.8H, v20.8H // ................................................*. + // sub v0.8H, v8.8H, v20.8H // ...............................................*.. + // str q28, [x1], #(64) // .................................................* + + sub count, count, #1 +layer4567_start: + ld4 {v14.4S, v15.4S, v16.4S, v17.4S}, [x1] // e....................................................................... + ldr q28, [x4, #64] // .....e.................................................................. + sqrdmulh v25.8H, v0.8H, v13.H[1] // .............................................................*.......... + mul v19.8H, v0.8H, v13.H[0] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v31.8H, v10.8H, v7.H[0] // ........................................*............................... + ldr q6, [x4, #16] // ..e..................................................................... 
+ sub v8.8H, v26.8H, v5.8H // .........................................*.............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q24, [x4, #48] // ....e................................................................... + mls v19.8H, v25.8H, v7.H[0] // ..............................................................*......... + sqdmulh v26.8H, v31.8H, v7.H[1] // .................................................*...................... + ldr q25, [x4, #80] // ......e................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ mul v3.8H, v8.8H, v13.H[4] // ...........................................*............................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v30.8H, v8.8H, v13.H[5] // ............................................*........................... + ldr q10, [x4, #32] // ...e.................................................................... + ldr q2, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v5.8H, v16.8H, v17.8H // ............e........................................................... + add v21.8H, v16.8H, v17.8H // .............e.......................................................... + srshr v1.8H, v26.8H, #11 // ..................................................*..................... + sub v27.8H, v14.8H, v15.8H // .......e................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v16.8H, v14.8H, v15.8H // ........e............................................................... + mls v3.8H, v30.8H, v7.H[0] // .............................................*.......................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v23.8H, v5.8H, v25.8H // ...............e........................................................ + mul v11.8H, v5.8H, v28.8H // ..............e......................................................... + sqrdmulh v20.8H, v27.8H, v24.8H // ..........e............................................................. + mul v9.8H, v27.8H, v10.8H // .........e.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v31.8H, v1.8H, v7.H[0] // ...................................................*.................... 
+ sub v25.8H, v16.8H, v21.8H // .................e...................................................... + add v18.8H, v16.8H, v21.8H // ..................e..................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v17.8H, v3.8H, v7.H[1] // .......................................................*................ + mul v29.8H, v25.8H, v2.8H // ...................e.................................................... + sqrdmulh v14.8H, v25.8H, v6.8H // ....................e................................................... + mls v9.8H, v20.8H, v7.H[0] // ...........e............................................................ + mls v11.8H, v23.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v23.8H, v17.8H, #11 // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v30.8H, v9.8H, v11.8H // ......................e................................................. + add v5.8H, v9.8H, v11.8H // .......................e................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v29.8H, v14.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v21.4S, v18.4S, v5.4S // ............................e........................................... + mul v12.8H, v30.8H, v2.8H // ........................e............................................... + sqrdmulh v22.8H, v30.8H, v6.8H // .........................e.............................................. + mls v3.8H, v23.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v28.4S, v18.4S, v5.4S // ...........................e............................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v12.8H, v22.8H, v7.H[0] // ..........................e............................................. + add v23.8H, v31.8H, v3.8H // ................................................................*....... + sub v8.8H, v31.8H, v3.8H // ...............................................................*........ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q23, [x1, #-48] // .....................................................................*.. + sqrdmulh v23.8H, v8.8H, v13.H[1] // ..................................................................*..... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v22.4S, v29.4S, v12.4S // ..............................e......................................... + trn1 v25.4S, v29.4S, v12.4S // .............................e.......................................... + str q19, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v5.2D, v21.2D, v22.2D // ................................e....................................... + trn1 v24.2D, v21.2D, v22.2D // ..................................e..................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v26.2D, v28.2D, v25.2D // ...............................e........................................ + trn1 v25.2D, v28.2D, v25.2D // .................................e...................................... + mul v22.8H, v8.8H, v13.H[0] // .................................................................*...... + ldr q13, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v20.8H, v26.8H, v5.8H // ..........................................e............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v8.8H, v25.8H, v24.8H // .....................................e.................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v10.8H, v20.8H, v7.H[1] // ....................................................e................... + sqdmulh v11.8H, v8.8H, v7.H[1] // ..............................................e......................... + mls v22.8H, v23.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sub v23.8H, v25.8H, v24.8H // ....................................e................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v27.8H, v11.8H, #11 // ...............................................e........................ 
+ srshr v10.8H, v10.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q22, [x1, #-16] // .......................................................................* + mul v31.8H, v23.8H, v13.H[2] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v8.8H, v27.8H, v7.H[0] // ................................................e....................... + mls v20.8H, v10.8H, v7.H[0] // ......................................................e................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v10.8H, v23.8H, v13.H[3] // .......................................e................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v28.8H, v8.8H, v20.8H // ...........................................................e............ + sub v0.8H, v8.8H, v20.8H // ..........................................................e............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q28, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e................................................................ + // ldr q0, [x4], #(6*16) // ..............e.........................................................|.............e.................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // .....e..................................................................|....e........................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // .............e..........................................................|............e................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // .......e................................................................|......e......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // .e......................................................................|e............................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // ..........e.............................................................|.........e...................................................... + // sub v24.8h, v8.8h, v9.8h // ..................e.....................................................|.................e.............................................. 
+ // add v8.8h, v8.8h, v9.8h // ...................e....................................................|..................e............................................. + // mul v9.8h, v24.8h, v1.8h // ........................e...............................................|.......................e........................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // .......................e................................................|......................e......................................... + // mls v9.8h, v24.8h, v7.h[0] // ...............................e........................................|..............................e................................. + // sub v24.8h, v10.8h, v11.8h // ...............e........................................................|..............e................................................. + // add v10.8h, v10.8h, v11.8h // ................e.......................................................|...............e................................................ + // mul v11.8h, v24.8h, v2.8h // ......................e.................................................|.....................e.......................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // .....................e..................................................|....................e........................................... + // mls v11.8h, v24.8h, v7.h[0] // ................................e.......................................|...............................e................................ + // sub v24.8h, v8.8h, v10.8h // ..........................e.............................................|.........................e...................................... + // add v8.8h, v8.8h, v10.8h // ...........................e............................................|..........................e..................................... 
+ // mul v10.8h, v24.8h, v0.8h // .............................e..........................................|............................e................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ..............................e.........................................|.............................e.................................. + // mls v10.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e............................ + // sub v24.8h, v9.8h, v11.8h // ..................................e.....................................|.................................e.............................. + // add v9.8h, v9.8h, v11.8h // ...................................e....................................|..................................e............................. + // mul v11.8h, v24.8h, v0.8h // ......................................e.................................|.....................................e.......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // .......................................e................................|......................................e......................... + // mls v11.8h, v24.8h, v7.h[0] // ..........................................e.............................|.........................................e...................... + // trn1 v25.4s, v8.4s, v9.4s // .........................................e..............................|........................................e....................... + // trn2 v26.4s, v8.4s, v9.4s // .....................................e..................................|....................................e........................... + // trn1 v27.4s, v10.4s, v11.4s // ................................................e.......................|...............................................e................ 
+ // trn2 v28.4s, v10.4s, v11.4s // ...............................................e........................|..............................................e................. + // trn2 v10.2d, v25.2d, v27.2d // ....................................................e...................|...................................................e............ + // trn2 v11.2d, v26.2d, v28.2d // ..................................................e.....................|.................................................e.............. + // trn1 v8.2d, v25.2d, v27.2d // .....................................................e..................|....................................................e........... + // trn1 v9.2d, v26.2d, v28.2d // ...................................................e....................|..................................................e............. + // ldr q0, [x3], #16 // .......................................................e................|......................................................e......... + // sub v24.8h, v8.8h, v9.8h // .............................................................e..........|............................................................e... + // add v8.8h, v8.8h, v9.8h // .........................................................e..............|........................................................e....... + // mul v9.8h, v24.8h, v0.h[2] // .................................................................e......|................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................................................................e...|................................................................ + // mls v9.8h, v24.8h, v7.h[0] // ....*...................................................................|...*............................................................ 
+ // sub v24.8h, v10.8h, v11.8h // ......*.................................................................|.....*.......................................................... + // add v10.8h, v10.8h, v11.8h // ........................................................e...............|.......................................................e........ + // mul v11.8h, v24.8h, v0.h[4] // ...........*............................................................|..........*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ............*...........................................................|...........*.................................................... + // mls v11.8h, v24.8h, v7.h[0] // ....................*...................................................|...................*............................................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........................................................e............|..........................................................e..... + // srshr v25.8h, v25.8h, #11 // ..............................................................e.........|.............................................................e.. + // mls v8.8h, v25.8h, v7.h[0] // ..................................................................e.....|................................................................ + // sqdmulh v25.8h, v9.8h, v7.h[1] // .........*..............................................................|........*....................................................... + // srshr v25.8h, v25.8h, #11 // .................*......................................................|................*............................................... + // mls v9.8h, v25.8h, v7.h[0] // .........................*..............................................|........................*....................................... 
+ // sqdmulh v25.8h, v10.8h, v7.h[1] // ..........................................................e.............|.........................................................e...... + // srshr v25.8h, v25.8h, #11 // ...............................................................e........|..............................................................e. + // mls v10.8h, v25.8h, v7.h[0] // ...................................................................e....|................................................................ + // sqdmulh v25.8h, v11.8h, v7.h[1] // ............................*...........................................|...........................*.................................... + // srshr v25.8h, v25.8h, #11 // .................................*......................................|................................*............................... + // mls v11.8h, v25.8h, v7.h[0] // ........................................*...............................|.......................................*........................ + // sub v24.8h, v8.8h, v10.8h // ......................................................................e.|................................................................ + // add v8.8h, v8.8h, v10.8h // .....................................................................e..|................................................................ + // mul v10.8h, v24.8h, v0.h[0] // ...*....................................................................|..*............................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*.....................................................................|.*.............................................................. + // mls v10.8h, v24.8h, v7.h[0] // ........*...............................................................|.......*........................................................ 
+ // sub v24.8h, v9.8h, v11.8h // ............................................*...........................|...........................................*.................... + // add v9.8h, v9.8h, v11.8h // ...........................................*............................|..........................................*..................... + // mul v11.8h, v24.8h, v0.h[0] // ......................................................*.................|.....................................................*.......... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.........................|.............................................*.................. + // mls v11.8h, v24.8h, v7.h[0] // ............................................................*...........|...........................................................*.... + // str q8, [x1], #(64) // .......................................................................e|................................................................ + // str q9, [x1, #(-64 + 16*1)] // .............................................*..........................|............................................*................... + // str q10, [x1, #(-64 + 16*2)] // .................................................*......................|................................................*............... + // str q11, [x1, #(-64 + 16*3)] // ................................................................*.......|...............................................................* + + sub count, count, #1 + cbnz count, layer4567_start + mls v31.8H, v10.8H, v7.H[0] // ..*................... + sub v4.8H, v26.8H, v5.8H // ...*.................. + mul v9.8H, v0.8H, v13.H[0] // .*.................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ sqrdmulh v25.8H, v0.8H, v13.H[1] // *..................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mul v28.8H, v4.8H, v13.H[4] // ......*............... + sqrdmulh v16.8H, v4.8H, v13.H[5] // .......*.............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqdmulh v5.8H, v31.8H, v7.H[1] // .....*................ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v9.8H, v25.8H, v7.H[0] // ....*................. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v28.8H, v16.8H, v7.H[0] // .........*............ + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + srshr v21.8H, v5.8H, #11 // ........*............. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q9, [x1, #-32] // ..................*... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqdmulh v4.8H, v28.8H, v7.H[1] // ...........*.......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v31.8H, v21.8H, v7.H[0] // ..........*........... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + srshr v14.8H, v4.8H, #11 // ............*......... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v28.8H, v14.8H, v7.H[0] // .............*........ + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sub v8.8H, v31.8H, v28.8H // ...............*...... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + add v26.8H, v31.8H, v28.8H // ..............*....... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + sqrdmulh v20.8H, v8.8H, v13.H[1] // .................*.... + mul v13.8H, v8.8H, v13.H[0] // ...................*.. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q26, [x1, #-48] // ................*..... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... 
+ // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + mls v13.8H, v20.8H, v7.H[0] // ....................*. + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + str q13, [x1, #-16] // .....................* + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + // gap // ...................... + + // original source code + // sqrdmulh v25.8H, v0.8H, v13.H[1] // ...*.................. + // mul v19.8H, v0.8H, v13.H[0] // ..*................... + // mls v31.8H, v10.8H, v7.H[0] // *..................... + // sub v8.8H, v26.8H, v5.8H // .*.................... + // mls v19.8H, v25.8H, v7.H[0] // .......*.............. + // sqdmulh v26.8H, v31.8H, v7.H[1] // ......*............... + // mul v3.8H, v8.8H, v13.H[4] // ....*................. + // sqrdmulh v30.8H, v8.8H, v13.H[5] // .....*................ + // srshr v1.8H, v26.8H, #11 // .........*............ 
+ // mls v3.8H, v30.8H, v7.H[0] // ........*............. + // mls v31.8H, v1.8H, v7.H[0] // ............*......... + // sqdmulh v17.8H, v3.8H, v7.H[1] // ...........*.......... + // srshr v23.8H, v17.8H, #11 // .............*........ + // mls v3.8H, v23.8H, v7.H[0] // ..............*....... + // add v23.8H, v31.8H, v3.8H // ................*..... + // sub v8.8H, v31.8H, v3.8H // ...............*...... + // str q23, [x1, #-48] // ...................*.. + // sqrdmulh v23.8H, v8.8H, v13.H[1] // .................*.... + // str q19, [x1, #-32] // ..........*........... + // mul v22.8H, v8.8H, v13.H[0] // ..................*... + // mls v22.8H, v23.8H, v7.H[0] // ....................*. + // str q22, [x1, #-16] // .....................* + + + // --------------------------------------------------------------------- + + ninv .req v29 + ninv_tw .req v30 + + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + .p2align 2 + + ldr q19, [x0, #256] // *................................................. + ldr q23, [x0, #192] // .*................................................ + ldr q22, [x0, #128] // ..*............................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q28, [x0, #320] // ...*.............................................. + ldr q27, [x0, #64] // ....*............................................. + ldr q24, [x0, #0] // .....*............................................ + // gap // .................................................. + // gap // .................................................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + ldr q3, [x0, #384] // ......*........................................... + ldr q26, [x0, #448] // .......*.......................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v20.8H, v22.8H, v23.8H // ........*......................................... + add v23.8H, v22.8H, v23.8H // .......................*.......................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v22.8H, v19.8H, v28.8H // ............*..................................... + add v19.8H, v19.8H, v28.8H // .........*........................................ 
+ sub v28.8H, v24.8H, v27.8H // ...........*...................................... + add v27.8H, v24.8H, v27.8H // ..........*....................................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v24.8H, v3.8H, v26.8H // .................*................................ + sub v3.8H, v3.8H, v26.8H // .............*.................................... + sqrdmulh v26.8H, v20.8H, v1.H[1] // ..............*................................... + mul v20.8H, v20.8H, v1.H[0] // ................*................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sqrdmulh v11.8H, v22.8H, v1.H[3] // ....................*............................. + mul v22.8H, v22.8H, v1.H[2] // .....................*............................ + sqrdmulh v14.8H, v28.8H, v0.H[7] // ...............*.................................. + mul v25.8H, v28.8H, v0.H[6] // ......................*........................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v28.8H, v19.8H, v24.8H // .........................*........................ + add v19.8H, v19.8H, v24.8H // ........................*......................... + sqrdmulh v24.8H, v3.8H, v1.H[5] // ..................*............................... + mul v3.8H, v3.8H, v1.H[4] // ...................*.............................. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v5.8H, v27.8H, v23.8H // .............................*.................... + add v23.8H, v27.8H, v23.8H // ..............................*................... + mls v20.8H, v26.8H, v7.H[0] // ..........................*....................... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v25.8H, v14.8H, v7.H[0] // .................................*................ + mls v22.8H, v11.8H, v7.H[0] // ............................*..................... + mul v14.8H, v28.8H, v0.H[4] // ...............................*.................. + sqrdmulh v27.8H, v28.8H, v0.H[5] // ................................*................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mls v3.8H, v24.8H, v7.H[0] // ...........................*...................... + sqrdmulh v24.8H, v5.8H, v0.H[3] // ..................................*............... + mul v28.8H, v5.8H, v0.H[2] // ...................................*.............. + add v26.8H, v23.8H, v19.8H // .....................................*............ + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. 
+ sub v19.8H, v23.8H, v19.8H // ........................................*......... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v8.8H, v25.8H, v20.8H // .......................................*.......... + add v21.8H, v25.8H, v20.8H // .........................................*........ + mls v14.8H, v27.8H, v7.H[0] // ......................................*........... + mul v12.8H, v26.8H, v29.8H // ...........................................*...... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + add v10.8H, v22.8H, v3.8H // ....................................*............. + sub v11.8H, v22.8H, v3.8H // ..........................................*....... + mls v28.8H, v24.8H, v7.H[0] // .............................................*.... + sqrdmulh v13.8H, v26.8H, v30.8H // ............................................*..... + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + mul v22.8H, v19.8H, v0.H[0] // ..............................................*... + sqrdmulh v15.8H, v19.8H, v0.H[1] // ...............................................*.. + mul v17.8H, v8.8H, v0.H[2] // ................................................*. 
+ // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + sub v4.8H, v21.8H, v10.8H // .................................................* + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + // gap // .................................................. + + // original source code + // ldr q16, [x0, #256] // *................................................. + // ldr q5, [x0, #192] // .*................................................ + // ldr q31, [x0, #128] // ..*............................................... + // ldr q18, [x0, #320] // ...*.............................................. + // ldr q20, [x0, #64] // ....*............................................. + // ldr q11, [x0, #0] // .....*............................................ + // ldr q8, [x0, #384] // ......*........................................... + // ldr q10, [x0, #448] // .......*.......................................... + // sub v14.8H, v31.8H, v5.8H // ........*......................................... + // add v24.8H, v16.8H, v18.8H // ...........*...................................... + // add v21.8H, v11.8H, v20.8H // .............*.................................... + // sub v20.8H, v11.8H, v20.8H // ............*..................................... + // sub v19.8H, v16.8H, v18.8H // ..........*....................................... 
+ // sub v18.8H, v8.8H, v10.8H // ...............*.................................. + // sqrdmulh v16.8H, v14.8H, v1.H[1] // ................*................................. + // sqrdmulh v9.8H, v20.8H, v0.H[7] // ....................*............................. + // mul v13.8H, v14.8H, v1.H[0] // .................*................................ + // add v15.8H, v8.8H, v10.8H // ..............*................................... + // sqrdmulh v14.8H, v18.8H, v1.H[5] // ........................*......................... + // mul v12.8H, v18.8H, v1.H[4] // .........................*........................ + // sqrdmulh v10.8H, v19.8H, v1.H[3] // ..................*............................... + // mul v25.8H, v19.8H, v1.H[2] // ...................*.............................. + // mul v19.8H, v20.8H, v0.H[6] // .....................*............................ + // add v20.8H, v31.8H, v5.8H // .........*........................................ + // add v2.8H, v24.8H, v15.8H // .......................*.......................... + // sub v28.8H, v24.8H, v15.8H // ......................*........................... + // mls v13.8H, v16.8H, v7.H[0] // ............................*..................... + // mls v12.8H, v14.8H, v7.H[0] // .................................*................ + // mls v25.8H, v10.8H, v7.H[0] // ..............................*................... + // sub v18.8H, v21.8H, v20.8H // ..........................*....................... + // add v11.8H, v21.8H, v20.8H // ...........................*...................... + // mul v14.8H, v28.8H, v0.H[4] // ...............................*.................. + // sqrdmulh v20.8H, v28.8H, v0.H[5] // ................................*................. + // mls v19.8H, v9.8H, v7.H[0] // .............................*.................... + // sqrdmulh v17.8H, v18.8H, v0.H[3] // ..................................*............... 
+ // mul v28.8H, v18.8H, v0.H[2] // ...................................*.............. + // add v10.8H, v25.8H, v12.8H // ..........................................*....... + // add v24.8H, v11.8H, v2.8H // ....................................*............. + // mls v14.8H, v20.8H, v7.H[0] // ........................................*......... + // sub v8.8H, v19.8H, v13.8H // ......................................*........... + // sub v27.8H, v11.8H, v2.8H // .....................................*............ + // add v21.8H, v19.8H, v13.8H // .......................................*.......... + // sub v11.8H, v25.8H, v12.8H // ...........................................*...... + // mul v12.8H, v24.8H, v29.8H // .........................................*........ + // sqrdmulh v13.8H, v24.8H, v30.8H // .............................................*.... + // mls v28.8H, v17.8H, v7.H[0] // ............................................*..... + // mul v22.8H, v27.8H, v0.H[0] // ..............................................*... + // sqrdmulh v15.8H, v27.8H, v0.H[1] // ...............................................*.. + // mul v17.8H, v8.8H, v0.H[2] // ................................................*. + // sub v4.8H, v21.8H, v10.8H // .................................................* + + sub count, count, #1 +layer123_start: + ldr q16, [x0, #272] // ....e................................................................................... + // gap // ........................................................................................ + mul v25.8H, v11.8H, v0.H[4] // .............................................*.......................................... + sqrdmulh v3.8H, v11.8H, v0.H[5] // ..............................................*......................................... + ldr q5, [x0, #208] // ...e.................................................................................... 
+ sqrdmulh v19.8H, v8.8H, v0.H[3] // ....................................*................................................... + ldr q31, [x0, #144] // ..e..................................................................................... + add v27.8H, v21.8H, v10.8H // ......................................................*................................. + mul v9.8H, v4.8H, v0.H[0] // .......................................................*................................ + sqrdmulh v24.8H, v4.8H, v0.H[1] // ........................................................*............................... + add v23.8H, v28.8H, v14.8H // ...........................................................*............................ + ldr q18, [x0, #336] // .....e.................................................................................. + ldr q20, [x0, #80] // .e...................................................................................... + ldr q11, [x0, #16] // e....................................................................................... + // gap // ........................................................................................ + mls v12.8H, v13.8H, v7.H[0] // ..........................................................................*............. + sub v4.8H, v28.8H, v14.8H // ..........................................................*............................. + mls v22.8H, v15.8H, v7.H[0] // ....................................................*................................... + ldr q8, [x0, #400] // ......e................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v2.8H, v27.8H, v30.8H // ............................................................................*........... 
+ ldr q10, [x0, #464] // .......e................................................................................ + mul v27.8H, v27.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + mls v25.8H, v3.8H, v7.H[0] // ...............................................*........................................ + mul v3.8H, v23.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v6.8H, v23.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + mls v17.8H, v19.8H, v7.H[0] // .....................................*.................................................. + mls v9.8H, v24.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + str q12, [x0], #(16) // ....................................................................................*... + sqrdmulh v26.8H, v4.8H, v0.H[1] // .............................................................*.......................... + sub v14.8H, v31.8H, v5.8H // .............e.......................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v4.8H, v4.8H, v0.H[0] // ............................................................*........................... + add v24.8H, v16.8H, v18.8H // ...................e.................................................................... + str q22, [x0, #240] // ....................................................................*................... + add v21.8H, v11.8H, v20.8H // .........e.............................................................................. + sub v20.8H, v11.8H, v20.8H // ........e............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v27.8H, v2.8H, v7.H[0] // .............................................................................*.......... + sub v19.8H, v16.8H, v18.8H // ..................e..................................................................... + sub v11.8H, v17.8H, v25.8H // ...............................................................*........................ + sub v18.8H, v8.8H, v10.8H // .......................e................................................................ + add v28.8H, v17.8H, v25.8H // ................................................................*....................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ sqrdmulh v16.8H, v14.8H, v1.H[1] // ................e....................................................................... + str q9, [x0, #304] // .....................................................................*.................. + sqrdmulh v9.8H, v20.8H, v0.H[7] // ...........e............................................................................ + mul v13.8H, v14.8H, v1.H[0] // ...............e........................................................................ + add v15.8H, v8.8H, v10.8H // ........................e............................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v14.8H, v18.8H, v1.H[5] // ..........................e............................................................. + mul v12.8H, v18.8H, v1.H[4] // .........................e.............................................................. + sqrdmulh v10.8H, v19.8H, v1.H[3] // .....................e.................................................................. + mul v25.8H, v19.8H, v1.H[2] // ....................e................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v23.8H, v28.8H, v29.8H // .................................................................................*...... + mul v19.8H, v20.8H, v0.H[6] // ..........e............................................................................. + sqrdmulh v22.8H, v11.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v20.8H, v31.8H, v5.8H // ..............e......................................................................... + add v2.8H, v24.8H, v15.8H // .......................................e................................................ + sqrdmulh v31.8H, v28.8H, v30.8H // ..................................................................................*..... + sub v28.8H, v24.8H, v15.8H // ......................................e................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v13.8H, v16.8H, v7.H[0] // .................e...................................................................... + mls v12.8H, v14.8H, v7.H[0] // ...........................e............................................................ + mls v25.8H, v10.8H, v7.H[0] // ......................e................................................................. 
+ sub v18.8H, v21.8H, v20.8H // ............................e........................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v5.8H, v11.8H, v0.H[0] // .................................................................*...................... + add v11.8H, v21.8H, v20.8H // .............................e.......................................................... + mul v14.8H, v28.8H, v0.H[4] // ........................................e............................................... + sqrdmulh v20.8H, v28.8H, v0.H[5] // .........................................e.............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.8H, v9.8H, v7.H[0] // ............e........................................................................... + mls v23.8H, v31.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + sqrdmulh v17.8H, v18.8H, v0.H[3] // ...............................e........................................................ 
+ mul v28.8H, v18.8H, v0.H[2] // ..............................e......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v4.8H, v26.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + mls v3.8H, v6.8H, v7.H[0] // ................................................................................*....... + add v10.8H, v25.8H, v12.8H // ............................................e........................................... + add v24.8H, v11.8H, v2.8H // .................................................e...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v5.8H, v22.8H, v7.H[0] // ...................................................................*.................... + mls v14.8H, v20.8H, v7.H[0] // ..........................................e............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q27, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ 
+ sub v8.8H, v19.8H, v13.8H // .................................e...................................................... + sub v27.8H, v11.8H, v2.8H // ................................................e....................................... + add v21.8H, v19.8H, v13.8H // ..................................e..................................................... + sub v11.8H, v25.8H, v12.8H // ...........................................e............................................ + mul v12.8H, v24.8H, v29.8H // ........................................................................e............... + // gap // ........................................................................................ + sqrdmulh v13.8H, v24.8H, v30.8H // .........................................................................e.............. + mls v28.8H, v17.8H, v7.H[0] // ................................e....................................................... + str q4, [x0, #368] // ......................................................................*................. + // gap // ........................................................................................ + str q23, [x0, #176] // .......................................................................................* + str q3, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + mul v22.8H, v27.8H, v0.H[0] // ..................................................e..................................... + sqrdmulh v15.8H, v27.8H, v0.H[1] // ...................................................e.................................... + mul v17.8H, v8.8H, v0.H[2] // ...................................e.................................................... + // gap // ........................................................................................ 
+ str q5, [x0, #432] // .......................................................................*................ + sub v4.8H, v21.8H, v10.8H // .....................................................e.................................. + + // original source code + // ldr q8, [x0, #0] // ............e...........................................................................|...........e.......................................................................... + // ldr q9, [x0, #(1*(512/8))] // ...........e............................................................................|..........e........................................................................... + // ldr q10, [x0, #(2*(512/8))] // .....e..................................................................................|....e................................................................................. + // ldr q11, [x0, #(3*(512/8))] // ...e....................................................................................|..e................................................................................... + // ldr q12, [x0, #(4*(512/8))] // e.......................................................................................e...................................................................................... + // ldr q13, [x0, #(5*(512/8))] // ..........e.............................................................................|.........e............................................................................ + // ldr q14, [x0, #(6*(512/8))] // ................e.......................................................................|...............e...................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..................e.....................................................................|.................e.................................................................... 
+ // sub v24.8h, v8.8h, v9.8h // ................................e.......................................................|...............................e...................................................... + // add v8.8h, v8.8h, v9.8h // ...............................e........................................................|..............................e....................................................... + // mul v9.8h, v24.8h, v0.h[6] // ................................................e.......................................|...............................................e...................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ........................................e...............................................|.......................................e.............................................. + // mls v9.8h, v24.8h, v7.h[0] // ..............................................................e.........................|.............................................................e........................ + // sub v24.8h, v10.8h, v11.8h // ...........................e............................................................|..........................e........................................................... + // add v10.8h, v10.8h, v11.8h // ..................................................e.....................................|.................................................e.................................... + // mul v11.8h, v24.8h, v1.h[0] // .........................................e..............................................|........................................e............................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ......................................e.................................................|.....................................e................................................ 
+ // mls v11.8h, v24.8h, v7.h[0] // ......................................................e.................................|.....................................................e................................ + // sub v24.8h, v12.8h, v13.8h // ..................................e.....................................................|.................................e.................................................... + // add v12.8h, v12.8h, v13.8h // .............................e..........................................................|............................e......................................................... + // mul v13.8h, v24.8h, v1.h[2] // ..............................................e.........................................|.............................................e........................................ + // sqrdmulh v24.8h, v24.8h, v1.h[3] // .............................................e..........................................|............................................e......................................... + // mls v13.8h, v24.8h, v7.h[0] // ........................................................e...............................|.......................................................e.............................. + // sub v24.8h, v14.8h, v15.8h // ....................................e...................................................|...................................e.................................................. + // add v14.8h, v14.8h, v15.8h // ..........................................e.............................................|.........................................e............................................ + // mul v15.8h, v24.8h, v1.h[4] // ............................................e...........................................|...........................................e.......................................... 
+ // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........................................e............................................|..........................................e........................................... + // mls v15.8h, v24.8h, v7.h[0] // .......................................................e................................|......................................................e............................... + // sub v24.8h, v8.8h, v10.8h // .........................................................e..............................|........................................................e............................. + // add v8.8h, v8.8h, v10.8h // ...........................................................e............................|..........................................................e........................... + // mul v10.8h, v24.8h, v0.h[2] // .................................................................e......................|................................................................e..................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................................................................e.......................|...............................................................e...................... + // mls v10.8h, v24.8h, v7.h[0] // ...............................................................................e........|..............................................................................e....... + // sub v24.8h, v9.8h, v11.8h // .........................................................................e..............|........................................................................e............. + // add v9.8h, v9.8h, v11.8h // ...........................................................................e............|..........................................................................e........... 
+ // mul v11.8h, v24.8h, v0.h[2] // .....................................................................................e..|....................................................................................e. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....*...................................................................................|...*.................................................................................. + // mls v11.8h, v24.8h, v7.h[0] // .......................*................................................................|......................*............................................................... + // sub v24.8h, v12.8h, v14.8h // .....................................................e..................................|....................................................e................................. + // add v12.8h, v12.8h, v14.8h // ...................................................e....................................|..................................................e................................... + // mul v14.8h, v24.8h, v0.h[4] // ............................................................e...........................|...........................................................e.......................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .............................................................e..........................|............................................................e......................... + // mls v14.8h, v24.8h, v7.h[0] // .......................................................................e................|......................................................................e............... + // sub v24.8h, v13.8h, v15.8h // ............................................................................e...........|...........................................................................e.......... 
+ // add v13.8h, v13.8h, v15.8h // ....................................................................e...................|...................................................................e.................. + // mul v15.8h, v24.8h, v0.h[4] // .*......................................................................................|*..................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..*.....................................................................................|.*.................................................................................... + // mls v15.8h, v24.8h, v7.h[0] // ....................*...................................................................|...................*.................................................................. + // sub v24.8h, v8.8h, v12.8h // ..........................................................................e.............|.........................................................................e............ + // add v8.8h, v8.8h, v12.8h // .....................................................................e..................|....................................................................e................. + // mul v12.8h, v24.8h, v0.h[0] // ...................................................................................e....|..................................................................................e... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................................................e...|...................................................................................e.. + // mls v12.8h, v24.8h, v7.h[0] // ...............*........................................................................|..............*....................................................................... 
+ // sub v24.8h, v9.8h, v13.8h // .......................................................................................e|...................................................................................... + // add v9.8h, v9.8h, v13.8h // ......*.................................................................................|.....*................................................................................ + // mul v13.8h, v24.8h, v0.h[0] // .......*................................................................................|......*............................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........*...............................................................................|.......*.............................................................................. + // mls v13.8h, v24.8h, v7.h[0] // ........................*...............................................................|.......................*.............................................................. + // sub v24.8h, v10.8h, v14.8h // ..............*.........................................................................|.............*........................................................................ + // add v10.8h, v10.8h, v14.8h // .........*..............................................................................|........*............................................................................. + // mul v14.8h, v24.8h, v0.h[0] // ............................*...........................................................|...........................*.......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........................*.............................................................|.........................*............................................................ 
+ // mls v14.8h, v24.8h, v7.h[0] // ..................................................................*.....................|.................................................................*.................... + // sub v24.8h, v11.8h, v15.8h // ...................................*....................................................|..................................*................................................... + // add v11.8h, v11.8h, v15.8h // .....................................*..................................................|....................................*................................................. + // mul v15.8h, v24.8h, v0.h[0] // ..........................................................*.............................|.........................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................*......................................|................................................*..................................... + // mls v15.8h, v24.8h, v7.h[0] // ......................................................................*.................|.....................................................................*................ + // str q12, [x0, #(4*(512/8))] // ..............................*.........................................................|.............................*........................................................ + // str q13, [x0, #(5*(512/8))] // .......................................*................................................|......................................*............................................... + // str q14, [x0, #(6*(512/8))] // ................................................................................*.......|...............................................................................*...... 
+ // str q15, [x0, #(7*(512/8))] // ......................................................................................*.|.....................................................................................* + // mul v12.8h, v8.8h, v29.8h // .............................................................................e..........|............................................................................e......... + // sqrdmulh v8.8h, v8.8h, v30.8h // ..............................................................................e.........|.............................................................................e........ + // mls v12.8h, v8.8h, v7.h[0] // .............*..........................................................................|............*......................................................................... + // mul v13.8h, v9.8h, v29.8h // ...................*....................................................................|..................*................................................................... + // sqrdmulh v9.8h, v9.8h, v30.8h // .................*......................................................................|................*..................................................................... + // mls v13.8h, v9.8h, v7.h[0] // .................................*......................................................|................................*..................................................... + // mul v14.8h, v10.8h, v29.8h // .....................*..................................................................|....................*................................................................. + // sqrdmulh v10.8h, v10.8h, v30.8h // ......................*.................................................................|.....................*................................................................ 
+ // mls v14.8h, v10.8h, v7.h[0] // ...................................................................*....................|..................................................................*................... + // mul v15.8h, v11.8h, v29.8h // ...............................................*........................................|..............................................*....................................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................................*...................................|...................................................*.................................. + // mls v15.8h, v11.8h, v7.h[0] // ...............................................................*........................|..............................................................*....................... + // str q12, [x0], #(16) // .........................*..............................................................|........................*............................................................. + // str q13, [x0, #(-16 + 1*(512/8))] // ........................................................................*...............|.......................................................................*.............. + // str q14, [x0, #(-16 + 2*(512/8))] // ..................................................................................*.....|.................................................................................*.... + // str q15, [x0, #(-16 + 3*(512/8))] // .................................................................................*......|................................................................................*..... + + sub count, count, #1 + cbnz count, layer123_start + mul v6.8H, v11.8H, v0.H[4] // *..................................... + sqrdmulh v27.8H, v11.8H, v0.H[5] // .*.................................... + add v11.8H, v21.8H, v10.8H // ...*.................................. 
+ sqrdmulh v3.8H, v8.8H, v0.H[3] // ..*................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v25.8H, v4.8H, v0.H[0] // ....*................................. + sqrdmulh v24.8H, v4.8H, v0.H[1] // .....*................................ + add v26.8H, v28.8H, v14.8H // ......*............................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v16.8H, v28.8H, v14.8H // ........*............................. + sqrdmulh v23.8H, v11.8H, v30.8H // ..........*........................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v6.8H, v27.8H, v7.H[0] // ............*......................... + mls v17.8H, v3.8H, v7.H[0] // ...............*...................... + mul v18.8H, v26.8H, v29.8H // .............*........................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v22.8H, v15.8H, v7.H[0] // .........*............................ + sqrdmulh v26.8H, v26.8H, v30.8H // ..............*....................... + mul v4.8H, v11.8H, v29.8H // ...........*.......................... + mls v25.8H, v24.8H, v7.H[0] // ................*..................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mls v12.8H, v13.8H, v7.H[0] // .......*.............................. + sqrdmulh v9.8H, v16.8H, v0.H[1] // ..................*................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + sub v19.8H, v17.8H, v6.8H // ......................*............... + add v21.8H, v17.8H, v6.8H // .......................*.............. + mul v3.8H, v16.8H, v0.H[0] // ...................*.................. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q22, [x0, #256] // ....................*................. + mls v18.8H, v26.8H, v7.H[0] // ...............................*...... + mls v4.8H, v23.8H, v7.H[0] // .....................*................ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + mul v11.8H, v19.8H, v0.H[0] // ............................*......... + sqrdmulh v19.8H, v19.8H, v0.H[1] // ..........................*........... + mul v23.8H, v21.8H, v29.8H // .........................*............ + sqrdmulh v10.8H, v21.8H, v30.8H // ...........................*.......... + str q25, [x0, #320] // ........................*............. + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + mls v3.8H, v9.8H, v7.H[0] // ..............................*....... + str q12, [x0], #(16) // .................*.................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q4, [x0, #48] // .................................*.... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q18, [x0, #112] // ....................................*. + mls v11.8H, v19.8H, v7.H[0] // ................................*..... + mls v23.8H, v10.8H, v7.H[0] // .............................*........ + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q3, [x0, #368] // ..................................*... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... 
+ // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + str q11, [x0, #432] // .....................................* + str q23, [x0, #176] // ...................................*.. + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + // gap // ...................................... + + // original source code + // mul v25.8H, v11.8H, v0.H[4] // *..................................... + // sqrdmulh v3.8H, v11.8H, v0.H[5] // .*.................................... + // sqrdmulh v19.8H, v8.8H, v0.H[3] // ...*.................................. + // add v27.8H, v21.8H, v10.8H // ..*................................... + // mul v9.8H, v4.8H, v0.H[0] // ....*................................. + // sqrdmulh v24.8H, v4.8H, v0.H[1] // .....*................................ + // add v23.8H, v28.8H, v14.8H // ......*............................... + // mls v12.8H, v13.8H, v7.H[0] // ................*..................... + // sub v4.8H, v28.8H, v14.8H // .......*.............................. + // mls v22.8H, v15.8H, v7.H[0] // ............*......................... + // sqrdmulh v2.8H, v27.8H, v30.8H // ........*............................. + // mul v27.8H, v27.8H, v29.8H // ..............*....................... + // mls v25.8H, v3.8H, v7.H[0] // .........*............................ + // mul v3.8H, v23.8H, v29.8H // ...........*.......................... + // sqrdmulh v6.8H, v23.8H, v30.8H // .............*........................ + // mls v17.8H, v19.8H, v7.H[0] // ..........*........................... + // mls v9.8H, v24.8H, v7.H[0] // ...............*...................... 
+ // str q12, [x0], #(16) // ..............................*....... + // sqrdmulh v26.8H, v4.8H, v0.H[1] // .................*.................... + // mul v4.8H, v4.8H, v0.H[0] // ....................*................. + // str q22, [x0, #240] // .....................*................ + // mls v27.8H, v2.8H, v7.H[0] // .......................*.............. + // sub v11.8H, v17.8H, v25.8H // ..................*................... + // add v28.8H, v17.8H, v25.8H // ...................*.................. + // str q9, [x0, #304] // ............................*......... + // mul v23.8H, v28.8H, v29.8H // ..........................*........... + // sqrdmulh v22.8H, v11.8H, v0.H[1] // .........................*............ + // sqrdmulh v31.8H, v28.8H, v30.8H // ...........................*.......... + // mul v5.8H, v11.8H, v0.H[0] // ........................*............. + // mls v23.8H, v31.8H, v7.H[0] // ..................................*... + // mls v4.8H, v26.8H, v7.H[0] // .............................*........ + // mls v3.8H, v6.8H, v7.H[0] // ......................*............... + // mls v5.8H, v22.8H, v7.H[0] // .................................*.... + // str q27, [x0, #48] // ...............................*...... + // str q4, [x0, #368] // ...................................*.. + // str q23, [x0, #176] // .....................................* + // str q3, [x0, #112] // ................................*..... + // str q5, [x0, #432] // ....................................*. 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s new file mode 100644 index 00000000..7973747d --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_manual_ld4_opt_m1_icestorm.s @@ -0,0 +1,1474 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. 
the lack of size specifiers and simpler
+// syntax for pre and post increment for loads and stores.
+
+// Eventually, NeLight should include a proper parser for AArch64,
+// but for initial investigations, the below is enough.
+
+// ---------------------------------------------------------------------------
+// Vector load/store wrappers
+//
+// \vec is a vector register alias; prefixing it with qform_ selects the
+// q-register alias of the same register (the qform_* aliases are declared
+// with .req at the top of the function further down in this file).
+// ---------------------------------------------------------------------------
+
+// ldr_vo: load 16 bytes from [\base + \offset] into \vec; \base is unchanged.
+.macro ldr_vo vec, base, offset
+        ldr qform_\vec, [\base, #\offset]
+.endm
+
+// ldr_vi: load 16 bytes from [\base] into \vec, then add \inc to \base
+// (post-index addressing).
+.macro ldr_vi vec, base, inc
+        ldr qform_\vec, [\base], #\inc
+.endm
+
+// str_vo: store \vec to [\base + \offset]; \base is unchanged.
+.macro str_vo vec, base, offset
+        str qform_\vec, [\base, #\offset]
+.endm
+// str_vi: store \vec to [\base], then add \inc to \base (post-index).
+.macro str_vi vec, base, inc
+        str qform_\vec, [\base], #\inc
+.endm
+
+// ---------------------------------------------------------------------------
+// Thin arithmetic wrappers on 8 x 16-bit lanes (.8h).
+// The variants ending in q take a lane index \i and multiply every lane of
+// \a by the single 16-bit element \b.h[\i].
+// ---------------------------------------------------------------------------
+
+// vqrdmulh: \d = sqrdmulh(\a, \b), vector by vector.
+.macro vqrdmulh d,a,b
+        sqrdmulh \d\().8h, \a\().8h, \b\().8h
+.endm
+// vmlsq: \d = \d - \a * \b.h[\i]  (multiply-subtract, by element).
+.macro vmlsq d,a,b,i
+        mls \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+// vqrdmulhq: \d = sqrdmulh(\a, \b.h[\i])  (rounding, by element).
+.macro vqrdmulhq d,a,b,i
+        sqrdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+// vqdmulhq: \d = sqdmulh(\a, \b.h[\i])  (non-rounding, by element).
+.macro vqdmulhq d,a,b,i
+        sqdmulh \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+// vmulq: \d = low 16 bits of \a * \b.h[\i]  (by element).
+.macro vmulq d,a,b,i
+        mul \d\().8h, \a\().8h, \b\().h[\i]
+.endm
+
+// ---------------------------------------------------------------------------
+// Multiplication by a constant followed by a correction with consts.h[0].
+//
+// consts is loaded from const_addr in the function below, so consts.h[0] is
+// 3329 and consts.h[1] is 20159.
+// NOTE(review): this looks like the usual Barrett-style modular multiplication
+// with a precomputed "twisted" companion constant; confirm against the
+// twiddle table generator.
+// ---------------------------------------------------------------------------
+
+// mulmodq: constant taken from two lanes of one vector.
+//   \dst = \src * \const.h[\idx0]
+//   \src = sqrdmulh(\src, \const.h[\idx1])      (\src is clobbered)
+//   \dst = \dst - \src * consts.h[0]
+.macro mulmodq dst, src, const, idx0, idx1
+        vmulq \dst, \src, \const, \idx0
+        vqrdmulhq \src, \src, \const, \idx1
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// mulmod: same computation as mulmodq, but the constant and its companion are
+// full vectors (\const and \const_twisted). \src is clobbered.
+.macro mulmod dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// gs_butterfly: butterfly with a by-element root.
+//   tmp = \a - \b
+//   \a  = \a + \b
+//   \b  = mulmodq(tmp, \root.h[\idx0], \root.h[\idx1])
+// Clobbers the tmp alias.
+// NOTE(review): "gs" presumably stands for Gentleman-Sande; confirm.
+.macro gs_butterfly a, b, root, idx0, idx1
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmodq \b, tmp, \root, \idx0, \idx1
+.endm
+
+// mulmod_v: instruction-for-instruction copy of mulmod above.
+.macro mulmod_v dst, src, const, const_twisted
+        mul \dst\().8h, \src\().8h, \const\().8h
+        vqrdmulh \src, \src, \const_twisted
+        vmlsq \dst, \src, consts, 0
+.endm
+
+// gs_butterfly_v: butterfly with full-vector roots (\root, \root_twisted).
+// Same data flow as gs_butterfly; clobbers tmp. It expands mulmod, which is
+// identical to mulmod_v.
+.macro gs_butterfly_v a, b, root, root_twisted
+        sub tmp.8h, \a\().8h, \b\().8h
+        add \a\().8h, \a\().8h, \b\().8h
+        mulmod \b, tmp, \root, \root_twisted
+.endm
+
+// mul_ninv: apply mulmod with the vectors ninv / ninv_tw to four
+// source/destination pairs. Each \srcN is clobbered. The aliases ninv and
+// ninv_tw must be provided by the code using this macro.
+.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
+        mulmod \dst0, \src0, ninv, ninv_tw
+        mulmod \dst1, \src1, ninv, ninv_tw
+        mulmod \dst2, \src2, ninv, ninv_tw
+        mulmod \dst3, \src3, ninv, ninv_tw
+.endm
+
+// barrett_reduce: reduce \a in place.
+//   t0 = sqdmulh(\a, consts.h[1])
+//   t0 = srshr(t0, 11)                (rounding shift right)
+//   \a = \a - t0 * consts.h[0]
+// Clobbers t0.
+.macro barrett_reduce a
+        vqdmulhq t0, \a, consts, 1
+        srshr t0.8h, t0.8h, #11
+        vmlsq \a, t0, consts, 0
+.endm
+
+// ---------------------------------------------------------------------------
+// Twiddle-factor loads
+// ---------------------------------------------------------------------------
+
+// load_roots_123: read 32 bytes at r_ptr0 into root0 (first 16 bytes) and
+// root1 (second 16 bytes, addressed as -16 after the post-increment);
+// r_ptr0 advances by 32.
+.macro load_roots_123
+        ldr_vi root0, r_ptr0, 32
+        ldr_vo root1, r_ptr0, -16
+.endm
+
+// load_next_roots_45: read 16 bytes at r_ptr0 into root0; r_ptr0 advances
+// by 16.
+.macro load_next_roots_45
+        ldr_vi root0, r_ptr0, 16
+.endm
+
+// load_next_roots_67: read six consecutive 16-byte vectors at r_ptr1 into
+// root0, root0_tw, root1, root1_tw, root2, root2_tw (in memory order);
+// r_ptr1 advances by 6*16. The first load post-increments, so the remaining
+// offsets are expressed relative to the already advanced pointer.
+.macro load_next_roots_67
+        ldr_vi root0, r_ptr1, (6*16)
+        ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16)
+        ldr_vo root1, r_ptr1, (-6*16 + 2*16)
+        ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16)
+        ldr_vo root2, r_ptr1, (-6*16 + 4*16)
+        ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16)
+.endm
+
+// ---------------------------------------------------------------------------
+// Transposition helpers
+// ---------------------------------------------------------------------------
+
+// transpose4: in-place 4x4 transpose of 32-bit elements held in the four
+// registers \data0 .. \data3 (the digit is appended to the \data prefix).
+// Stage 1 interleaves 32-bit lanes into t0..t3, stage 2 interleaves 64-bit
+// halves back into \data0..\data3. Clobbers t0, t1, t2, t3.
+.macro transpose4 data
+        trn1 t0.4s, \data\()0.4s, \data\()1.4s
+        trn2 t1.4s, \data\()0.4s, \data\()1.4s
+        trn1 t2.4s, \data\()2.4s, \data\()3.4s
+        trn2 t3.4s, \data\()2.4s, \data\()3.4s
+
+        trn2 \data\()2.2d, t0.2d, t2.2d
+        trn2 \data\()3.2d, t1.2d, t3.2d
+        trn1 \data\()0.2d, t0.2d, t2.2d
+        trn1 \data\()1.2d, t1.2d, t3.2d
+.endm
+
+// transpose_single: only the 32-bit interleave stage of transpose4, reading
+// \data_in0..3 and writing \data_out0..3 (no temporaries).
+.macro transpose_single data_out, data_in
+        trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s
+        trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s
+        trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s
+.endm
+
+// ---------------------------------------------------------------------------
+// Register save/restore around the function body
+// ---------------------------------------------------------------------------
+
+// save_gprs: reserve 6*16 bytes of stack and spill x19..x29 (the AAPCS64
+// callee-saved general-purpose registers plus the frame pointer).
+// Each pair is stored exactly once; restore_gprs mirrors this layout.
+.macro save_gprs // slothy:no-unfold
+        sub sp, sp, #(16*6)
+        stp x19, x20, [sp, #16*0]
+        stp x21, x22, [sp, #16*1]
+        stp x23, x24, [sp, #16*2]
+        stp x25, x26, [sp, #16*3]
+        stp x27, x28, [sp, #16*4]
+        str x29, [sp, #16*5]
+.endm
+
+// restore_gprs: reload x19..x29 from the layout written by save_gprs and
+// release the 6*16 bytes.
+.macro restore_gprs // slothy:no-unfold
+        ldp x19, x20, [sp, #16*0]
+        ldp x21, x22, [sp, #16*1]
+        ldp x23, x24, [sp, #16*2]
+        ldp x25, x26, [sp, #16*3]
+        ldp x27, x28, [sp, #16*4]
+        ldr x29, [sp, #16*5]
+        add sp, sp, #(16*6)
+.endm
+
+// save_vregs: reserve 4*16 bytes of stack and spill d8..d15, i.e. the low
+// 64 bits of v8..v15 (the part AAPCS64 requires a callee to preserve).
+.macro save_vregs // slothy:no-unfold
+        sub sp, sp, #(16*4)
+        stp d8, d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// restore_vregs: reload d8..d15 and release the 4*16 bytes.
+.macro restore_vregs // slothy:no-unfold
+        ldp d8, d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Scratch area reserved below the saved registers by push_stack:
+// STACK_SIZE is its size in bytes, STACK0 the sp-relative offset of its
+// first slot (used with the save/restore macros).
+#define STACK_SIZE 16
+#define STACK0 0
+
+// restore: load register \a from the scratch slot at [sp + \loc].
+.macro restore a, loc //
slothy:no-unfold
+        ldr \a, [sp, #\loc\()]
+.endm
+// save: store register \a to the scratch slot at [sp + \loc].
+// Note the argument order (slot first, register second).
+.macro save loc, a // slothy:no-unfold
+        str \a, [sp, #\loc\()]
+.endm
+// push_stack: function prologue. Spills the general-purpose and vector
+// registers handled by save_gprs / save_vregs, then reserves STACK_SIZE
+// bytes of scratch space, so scratch slots sit at the lowest addresses.
+.macro push_stack // slothy:no-unfold
+        save_gprs
+        save_vregs
+        sub sp, sp, #STACK_SIZE
+.endm
+
+// pop_stack: function epilogue. Undoes push_stack in reverse order.
+.macro pop_stack // slothy:no-unfold
+        add sp, sp, #STACK_SIZE
+        restore_vregs
+        restore_gprs
+.endm
+
+// Twiddle table: placed in .data, 16-byte aligned, contents pulled in from a
+// separate file by the preprocessor.
+.data
+.p2align 4
+roots:
+#include "intt_kyber_123_45_67_twiddles.s"
+.text
+
+// Exported with and without a leading underscore.
+        .global intt_kyber_123_4567_manual_ld4_opt_m1_icestorm
+        .global _intt_kyber_123_4567_manual_ld4_opt_m1_icestorm
+
+// Constant vectors (8 x 16-bit each, 16-byte aligned, emitted in .text).
+// const_addr is loaded into the consts register by the function below:
+// lane 0 = 3329 (subtracted multiple in mulmod*/barrett_reduce),
+// lane 1 = 20159 (multiplier in barrett_reduce), remaining lanes are zero.
+.p2align 4
+const_addr: .short 3329
+        .short 20159
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+        .short 0
+// 512 replicated into all eight lanes.
+// NOTE(review): presumably the scaling constant used via the ninv alias of
+// mul_ninv; confirm.
+ninv_addr: .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+        .short 512
+// 5040 replicated into all eight lanes.
+// NOTE(review): presumably the twisted companion of ninv_addr (ninv_tw);
+// confirm.
+ninv_tw_addr: .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+        .short 5040
+
+intt_kyber_123_4567_manual_ld4_opt_m1_icestorm:
+_intt_kyber_123_4567_manual_ld4_opt_m1_icestorm:
+        push_stack
+
+        // General-purpose register aliases.
+        in .req x0
+        inp .req x1
+        count .req x2
+        r_ptr0 .req x3
+        r_ptr1 .req x4
+        xtmp .req x5
+
+        // q-register view of every vector register; used by the
+        // ldr_*/str_* macros through the qform_ prefix.
+        qform_v0 .req q0
+        qform_v1 .req q1
+        qform_v2 .req q2
+        qform_v3 .req q3
+        qform_v4 .req q4
+        qform_v5 .req q5
+        qform_v6 .req q6
+        qform_v7 .req q7
+        qform_v8 .req q8
+        qform_v9 .req q9
+        qform_v10 .req q10
+        qform_v11 .req q11
+        qform_v12 .req q12
+        qform_v13 .req q13
+        qform_v14 .req q14
+        qform_v15 .req q15
+        qform_v16 .req q16
+        qform_v17 .req q17
+        qform_v18 .req q18
+        qform_v19 .req q19
+        qform_v20 .req q20
+        qform_v21 .req q21
+        qform_v22 .req q22
+        qform_v23 .req q23
+        qform_v24 .req q24
+        qform_v25 .req q25
+        qform_v26 .req q26
+        qform_v27 .req q27
+        qform_v28 .req q28
+        qform_v29 .req q29
+        qform_v30 .req q30
+        qform_v31 .req q31
+
+        // Data vectors data0..data7 live in v8..v15.
+        data0 .req v8
+        data1 .req v9
+        data2 .req v10
+        data3 .req v11
+        data4 .req v12
+        data5 .req v13
+        data6 .req v14
+        data7 .req v15
+
+        // Scalar aliases x_00..x_31 mapped onto x10..x17.
+        x_00 .req x10
+        x_01 .req x11
+        x_10 .req x12
+        x_11 .req x13
+        x_20 .req x14
+        x_21 .req x15
+        x_30 .req x16
+        x_31
.req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ld4 {v27.4S, v28.4S, v29.4S, v30.4S}, [x1] // *..................................................... + ldr q5, [x3], #16 // .........................................*............ + // gap // ...................................................... + // gap // ...................................................... + ldr q17, [x4, #64] // ..*................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + ldr q26, [x4, #80] // ...*.................................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + ldr q1, [x4, #32] // .....*................................................ + // gap // ...................................................... + // gap // ...................................................... 
+ // gap // ...................................................... + ldr q15, [x4, #48] // ....*................................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + ldr q23, [x4], #(6*16) // ........*............................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v16.8H, v29.8H, v30.8H // ......*............................................... + sub v25.8H, v29.8H, v30.8H // .......*.............................................. + ldr q19, [x4, #-80] // .*.................................................... + // gap // ...................................................... + sub v21.8H, v27.8H, v28.8H // .........*............................................ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v17.8H, v25.8H, v17.8H // ...........*.......................................... + sqrdmulh v26.8H, v25.8H, v26.8H // ..........*........................................... + // gap // ...................................................... + // gap // ...................................................... + mul v6.8H, v21.8H, v1.8H // .............*........................................ + sqrdmulh v14.8H, v21.8H, v15.8H // ............*......................................... + // gap // ...................................................... + // gap // ...................................................... + add v11.8H, v27.8H, v28.8H // ..............*....................................... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v17.8H, v26.8H, v7.H[0] // ...............*...................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v6.8H, v14.8H, v7.H[0] // ................*..................................... + sub v30.8H, v11.8H, v16.8H // .................*.................................... + // gap // ...................................................... + // gap // ...................................................... + add v29.8H, v11.8H, v16.8H // ..................*................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v4.8H, v30.8H, v23.8H // ....................*................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sub v15.8H, v6.8H, v17.8H // .....................*................................ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v9.8H, v30.8H, v19.8H // ...................*.................................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ mul v8.8H, v15.8H, v23.8H // .......................*.............................. + sqrdmulh v24.8H, v15.8H, v19.8H // ......................*............................... + // gap // ...................................................... + // gap // ...................................................... + add v26.8H, v6.8H, v17.8H // ........................*............................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v4.8H, v9.8H, v7.H[0] // .........................*............................ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v8.8H, v24.8H, v7.H[0] // ..........................*........................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v16.4S, v29.4S, v26.4S // ...........................*.......................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn2 v17.4S, v29.4S, v26.4S // ............................*......................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v14.4S, v4.4S, v8.4S // .............................*........................ + trn2 v21.4S, v4.4S, v8.4S // ..............................*....................... + // gap // ...................................................... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + trn2 v28.2D, v17.2D, v21.2D // ...............................*...................... + trn2 v31.2D, v16.2D, v14.2D // ................................*..................... + // gap // ...................................................... + // gap // ...................................................... + trn1 v27.2D, v16.2D, v14.2D // .................................*.................... + trn1 v11.2D, v17.2D, v21.2D // ..................................*................... + // gap // ...................................................... + // gap // ...................................................... + add v4.8H, v31.8H, v28.8H // ...................................*.................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v24.8H, v27.8H, v11.8H // ....................................*................. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqdmulh v20.8H, v4.8H, v7.H[1] // .......................................*.............. + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sub v3.8H, v31.8H, v28.8H // .....................................*................ + sqdmulh v8.8H, v24.8H, v7.H[1] // ........................................*............. 
+ // gap // ...................................................... + // gap // ...................................................... + sub v13.8H, v27.8H, v11.8H // ......................................*............... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + srshr v2.8H, v20.8H, #11 // ..........................................*........... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + srshr v19.8H, v8.8H, #11 // ...........................................*.......... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v10.8H, v13.8H, v5.H[3] // ..............................................*....... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v4.8H, v2.8H, v7.H[0] // ............................................*......... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v24.8H, v19.8H, v7.H[0] // .............................................*........ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v28.8H, v3.8H, v5.H[4] // ...............................................*...... 
+ // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mul v0.8H, v13.8H, v5.H[2] // .................................................*.... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + add v9.8H, v24.8H, v4.8H // ..................................................*... + sub v21.8H, v24.8H, v4.8H // ...................................................*.. + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v19.8H, v3.8H, v5.H[5] // ................................................*..... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v0.8H, v10.8H, v7.H[0] // ....................................................*. + str q9, [x1], #(64) // .....................................................* + // gap // ...................................................... + // gap // ...................................................... + + // original source code + // ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // *..................................................... + // ldr q30, [x4, #16] // .........*............................................ + // ldr q1, [x4, #64] // ..*................................................... + // ldr q14, [x4, #80] // ...*.................................................. + // ldr q8, [x4, #48] // .....*................................................ + // ldr q13, [x4, #32] // ....*................................................. 
+ // add v22.8H, v17.8H, v18.8H // .......*.............................................. + // sub v6.8H, v17.8H, v18.8H // ........*............................................. + // ldr q29, [x4], #(6*16) // ......*............................................... + // sub v26.8H, v15.8H, v16.8H // ..........*........................................... + // sqrdmulh v31.8H, v6.8H, v14.8H // ............*......................................... + // mul v24.8H, v6.8H, v1.8H // ...........*.......................................... + // sqrdmulh v20.8H, v26.8H, v8.8H // ..............*....................................... + // mul v26.8H, v26.8H, v13.8H // .............*........................................ + // add v8.8H, v15.8H, v16.8H // ...............*...................................... + // mls v24.8H, v31.8H, v7.H[0] // ................*..................................... + // mls v26.8H, v20.8H, v7.H[0] // .................*.................................... + // sub v27.8H, v8.8H, v22.8H // ..................*................................... + // add v3.8H, v8.8H, v22.8H // ...................*.................................. + // sqrdmulh v19.8H, v27.8H, v30.8H // ......................*............................... + // mul v16.8H, v27.8H, v29.8H // ....................*................................. + // sub v2.8H, v26.8H, v24.8H // .....................*................................ + // sqrdmulh v14.8H, v2.8H, v30.8H // ........................*............................. + // mul v22.8H, v2.8H, v29.8H // .......................*.............................. + // add v2.8H, v26.8H, v24.8H // .........................*............................ + // mls v16.8H, v19.8H, v7.H[0] // ..........................*........................... + // mls v22.8H, v14.8H, v7.H[0] // ...........................*.......................... + // trn1 v23.4S, v3.4S, v2.4S // ............................*......................... 
+ // trn2 v19.4S, v3.4S, v2.4S // .............................*........................ + // trn1 v20.4S, v16.4S, v22.4S // ..............................*....................... + // trn2 v13.4S, v16.4S, v22.4S // ...............................*...................... + // trn2 v27.2D, v19.2D, v13.2D // ................................*..................... + // trn2 v22.2D, v23.2D, v20.2D // .................................*.................... + // trn1 v26.2D, v23.2D, v20.2D // ..................................*................... + // trn1 v24.2D, v19.2D, v13.2D // ...................................*.................. + // add v23.8H, v22.8H, v27.8H // ....................................*................. + // add v10.8H, v26.8H, v24.8H // .....................................*................ + // sub v0.8H, v22.8H, v27.8H // .......................................*.............. + // sub v20.8H, v26.8H, v24.8H // .........................................*............ + // sqdmulh v28.8H, v23.8H, v7.H[1] // ......................................*............... + // sqdmulh v22.8H, v10.8H, v7.H[1] // ........................................*............. + // ldr q5, [x3], #16 // .*.................................................... + // srshr v28.8H, v28.8H, #11 // ..........................................*........... + // srshr v22.8H, v22.8H, #11 // ...........................................*.......... + // mls v23.8H, v28.8H, v7.H[0] // .............................................*........ + // mls v10.8H, v22.8H, v7.H[0] // ..............................................*....... + // sqrdmulh v30.8H, v20.8H, v5.H[3] // ............................................*......... + // mul v28.8H, v0.8H, v5.H[4] // ...............................................*...... + // sqrdmulh v19.8H, v0.8H, v5.H[5] // ...................................................*.. + // mul v0.8H, v20.8H, v5.H[2] // ................................................*..... 
+ // add v25.8H, v10.8H, v23.8H // .................................................*.... + // sub v21.8H, v10.8H, v23.8H // ..................................................*... + // mls v0.8H, v30.8H, v7.H[0] // ....................................................*. + // str q25, [x1], #(64) // .....................................................* + + sub count, count, #1 +layer4567_start: + ld4 {v15.4S, v16.4S, v17.4S, v18.4S}, [x1] // e....................................................................... + mls v28.8H, v19.8H, v7.H[0] // .............................................*.......................... + sqrdmulh v19.8H, v21.8H, v5.H[1] // .............................................................*.......... + ldr q30, [x4, #16] // ..e..................................................................... + ldr q1, [x4, #64] // .....e.................................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q14, [x4, #80] // ......e................................................................. + mul v11.8H, v21.8H, v5.H[0] // ............................................................*........... + // gap // ........................................................................ + // gap // ........................................................................ + sqdmulh v12.8H, v28.8H, v7.H[1] // .......................................................*................ + ldr q8, [x4, #48] // ....e................................................................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqdmulh v27.8H, v0.8H, v7.H[1] // .................................................*...................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + ldr q13, [x4, #32] // ...e.................................................................... + mls v11.8H, v19.8H, v7.H[0] // ..............................................................*......... + // gap // ........................................................................ + // gap // ........................................................................ + add v22.8H, v17.8H, v18.8H // .............e.......................................................... + sub v6.8H, v17.8H, v18.8H // ............e........................................................... + ldr q29, [x4], #(6*16) // .e...................................................................... + // gap // ........................................................................ + sub v26.8H, v15.8H, v16.8H // .......e................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v31.8H, v6.8H, v14.8H // ...............e........................................................ + mul v24.8H, v6.8H, v1.8H // ..............e......................................................... + str q11, [x1, #-32] // ......................................................................*. + // gap // ........................................................................ + sqrdmulh v20.8H, v26.8H, v8.8H // ..........e............................................................. 
+ mul v26.8H, v26.8H, v13.8H // .........e.............................................................. + // gap // ........................................................................ + // gap // ........................................................................ + add v8.8H, v15.8H, v16.8H // ........e............................................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v13.8H, v27.8H, #11 // ..................................................*..................... + mls v24.8H, v31.8H, v7.H[0] // ................e....................................................... + // gap // ........................................................................ + // gap // ........................................................................ + mls v26.8H, v20.8H, v7.H[0] // ...........e............................................................ + sub v27.8H, v8.8H, v22.8H // .................e...................................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v3.8H, v8.8H, v22.8H // ..................e..................................................... + srshr v22.8H, v12.8H, #11 // ........................................................*............... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v19.8H, v27.8H, v30.8H // ....................e................................................... + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + mul v16.8H, v27.8H, v29.8H // ...................e.................................................... + sub v2.8H, v26.8H, v24.8H // ......................e................................................. + // gap // ........................................................................ + // gap // ........................................................................ + mls v28.8H, v22.8H, v7.H[0] // .........................................................*.............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v14.8H, v2.8H, v30.8H // .........................e.............................................. + mul v22.8H, v2.8H, v29.8H // ........................e............................................... + // gap // ........................................................................ + // gap // ........................................................................ + add v2.8H, v26.8H, v24.8H // .......................e................................................ + mls v16.8H, v19.8H, v7.H[0] // .....................e.................................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + mls v22.8H, v14.8H, v7.H[0] // ..........................e............................................. + mls v0.8H, v13.8H, v7.H[0] // ...................................................*.................... + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v23.4S, v3.4S, v2.4S // ...........................e............................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v19.4S, v3.4S, v2.4S // ............................e........................................... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + trn1 v20.4S, v16.4S, v22.4S // .............................e.......................................... + trn2 v13.4S, v16.4S, v22.4S // ..............................e......................................... + // gap // ........................................................................ + // gap // ........................................................................ + sub v6.8H, v0.8H, v28.8H // ...............................................................*........ + add v28.8H, v0.8H, v28.8H // ................................................................*....... + // gap // ........................................................................ + // gap // ........................................................................ 
+ trn2 v27.2D, v19.2D, v13.2D // ................................e....................................... + // gap // ........................................................................ + // gap // ........................................................................ + trn2 v22.2D, v23.2D, v20.2D // ...............................e........................................ + trn1 v26.2D, v23.2D, v20.2D // .................................e...................................... + trn1 v24.2D, v19.2D, v13.2D // ..................................e..................................... + str q28, [x1, #-48] // .....................................................................*.. + // gap // ........................................................................ + add v23.8H, v22.8H, v27.8H // ..........................................e............................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v10.8H, v26.8H, v24.8H // .....................................e.................................. + sub v0.8H, v22.8H, v27.8H // .........................................e.............................. + // gap // ........................................................................ + // gap // ........................................................................ + sub v20.8H, v26.8H, v24.8H // ....................................e................................... + sqdmulh v28.8H, v23.8H, v7.H[1] // ....................................................e................... + // gap // ........................................................................ + // gap // ........................................................................ 
+ sqdmulh v22.8H, v10.8H, v7.H[1] // ..............................................e......................... + mul v19.8H, v6.8H, v5.H[0] // .................................................................*...... + // gap // ........................................................................ + // gap // ........................................................................ + sqrdmulh v8.8H, v6.8H, v5.H[1] // ..................................................................*..... + ldr q5, [x3], #16 // ...................................e.................................... + // gap // ........................................................................ + // gap // ........................................................................ + srshr v28.8H, v28.8H, #11 // .....................................................e.................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + srshr v22.8H, v22.8H, #11 // ...............................................e........................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v19.8H, v8.8H, v7.H[0] // ...................................................................*.... + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v23.8H, v28.8H, v7.H[0] // ......................................................e................. 
+ // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + mls v10.8H, v22.8H, v7.H[0] // ................................................e....................... + sqrdmulh v30.8H, v20.8H, v5.H[3] // .......................................e................................ + // gap // ........................................................................ + // gap // ........................................................................ + str q19, [x1, #-16] // .......................................................................* + mul v28.8H, v0.8H, v5.H[4] // ...........................................e............................ + sqrdmulh v19.8H, v0.8H, v5.H[5] // ............................................e........................... + // gap // ........................................................................ + mul v0.8H, v20.8H, v5.H[2] // ......................................e................................. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + add v25.8H, v10.8H, v23.8H // ...........................................................e............ + sub v21.8H, v10.8H, v23.8H // ..........................................................e............. + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ + // gap // ........................................................................ 
+ // gap // ........................................................................ + // gap // ........................................................................ + mls v0.8H, v30.8H, v7.H[0] // ........................................e............................... + str q25, [x1], #(64) // ....................................................................e... + // gap // ........................................................................ + // gap // ........................................................................ + + // original source code + // ld4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x1] // e.......................................................................e................................................................ + // ldr q0, [x4], #(6*16) // ..............e.........................................................|.............e.................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ...e....................................................................|..e............................................................. + // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.............................................................|.........e...................................................... + // ldr q5, [x4, #(-6*16 + 3*16)] // ........e...............................................................|.......e........................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // ....e...................................................................|...e............................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // .....e..................................................................|....e........................................................... + // sub v24.8h, v8.8h, v9.8h // ...............e........................................................|..............e................................................. 
+ // add v8.8h, v8.8h, v9.8h // .....................e..................................................|....................e........................................... + // mul v9.8h, v24.8h, v1.8h // ....................e...................................................|...................e............................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // ...................e....................................................|..................e............................................. + // mls v9.8h, v24.8h, v7.h[0] // ........................e...............................................|.......................e........................................ + // sub v24.8h, v10.8h, v11.8h // .............e..........................................................|............e................................................... + // add v10.8h, v10.8h, v11.8h // ............e...........................................................|...........e.................................................... + // mul v11.8h, v24.8h, v2.8h // .................e......................................................|................e............................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ................e.......................................................|...............e................................................ + // mls v11.8h, v24.8h, v7.h[0] // .......................e................................................|......................e......................................... + // sub v24.8h, v8.8h, v10.8h // .........................e..............................................|........................e....................................... + // add v8.8h, v8.8h, v10.8h // ..........................e.............................................|.........................e...................................... 
+ // mul v10.8h, v24.8h, v0.8h // .............................e..........................................|............................e................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ............................e...........................................|...........................e.................................... + // mls v10.8h, v24.8h, v7.h[0] // ...................................e....................................|..................................e............................. + // sub v24.8h, v9.8h, v11.8h // ..............................e.........................................|.............................e.................................. + // add v9.8h, v9.8h, v11.8h // ..................................e.....................................|.................................e.............................. + // mul v11.8h, v24.8h, v0.8h // .................................e......................................|................................e............................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ................................e.......................................|...............................e................................ + // mls v11.8h, v24.8h, v7.h[0] // ....................................e...................................|...................................e............................ + // trn1 v25.4s, v8.4s, v9.4s // ......................................e.................................|.....................................e.......................... + // trn2 v26.4s, v8.4s, v9.4s // .......................................e................................|......................................e......................... + // trn1 v27.4s, v10.4s, v11.4s // ........................................e...............................|.......................................e........................ 
+ // trn2 v28.4s, v10.4s, v11.4s // .........................................e..............................|........................................e....................... + // trn2 v10.2d, v25.2d, v27.2d // .............................................e..........................|............................................e................... + // trn2 v11.2d, v26.2d, v28.2d // ............................................e...........................|...........................................e.................... + // trn1 v8.2d, v25.2d, v27.2d // ..............................................e.........................|.............................................e.................. + // trn1 v9.2d, v26.2d, v28.2d // ...............................................e........................|..............................................e................. + // ldr q0, [x3], #16 // .........................................................e..............|........................................................e....... + // sub v24.8h, v8.8h, v9.8h // ....................................................e...................|...................................................e............ + // add v8.8h, v8.8h, v9.8h // ..................................................e.....................|.................................................e.............. + // mul v9.8h, v24.8h, v0.h[2] // ...................................................................e....|................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................................................e........|..............................................................e. + // mls v9.8h, v24.8h, v7.h[0] // ......................................................................e.|................................................................ 
+ // sub v24.8h, v10.8h, v11.8h // ...................................................e....................|..................................................e............. + // add v10.8h, v10.8h, v11.8h // .................................................e......................|................................................e............... + // mul v11.8h, v24.8h, v0.h[4] // .................................................................e......|................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................................................................e.....|................................................................ + // mls v11.8h, v24.8h, v7.h[0] // .*......................................................................|*............................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ......................................................e.................|.....................................................e.......... + // srshr v25.8h, v25.8h, #11 // ...........................................................e............|..........................................................e..... + // mls v8.8h, v25.8h, v7.h[0] // ..............................................................e.........|.............................................................e.. + // sqdmulh v25.8h, v9.8h, v7.h[1] // .........*..............................................................|........*....................................................... + // srshr v25.8h, v25.8h, #11 // ......................*.................................................|.....................*.......................................... + // mls v9.8h, v25.8h, v7.h[0] // .....................................*..................................|....................................*........................... 
+ // sqdmulh v25.8h, v10.8h, v7.h[1] // .....................................................e..................|....................................................e........... + // srshr v25.8h, v25.8h, #11 // ..........................................................e.............|.........................................................e...... + // mls v10.8h, v25.8h, v7.h[0] // .............................................................e..........|............................................................e... + // sqdmulh v25.8h, v11.8h, v7.h[1] // .......*................................................................|......*......................................................... + // srshr v25.8h, v25.8h, #11 // ...........................*............................................|..........................*..................................... + // mls v11.8h, v25.8h, v7.h[0] // ...............................*........................................|..............................*................................. + // sub v24.8h, v8.8h, v10.8h // .....................................................................e..|................................................................ + // add v8.8h, v8.8h, v10.8h // ....................................................................e...|................................................................ + // mul v10.8h, v24.8h, v0.h[0] // ......*.................................................................|.....*.......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*.....................................................................|.*.............................................................. + // mls v10.8h, v24.8h, v7.h[0] // ...........*............................................................|..........*..................................................... 
+ // sub v24.8h, v9.8h, v11.8h // ..........................................*.............................|.........................................*...................... + // add v9.8h, v9.8h, v11.8h // ...........................................*............................|..........................................*..................... + // mul v11.8h, v24.8h, v0.h[0] // .......................................................*................|......................................................*......... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*...............|.......................................................*........ + // mls v11.8h, v24.8h, v7.h[0] // ............................................................*...........|...........................................................*.... + // str q8, [x1], #(64) // .......................................................................e|................................................................ + // str q9, [x1, #(-64 + 16*1)] // ................................................*.......................|...............................................*................ + // str q10, [x1, #(-64 + 16*2)] // ..................*.....................................................|.................*.............................................. + // str q11, [x1, #(-64 + 16*3)] // ................................................................*.......|...............................................................* + + sub count, count, #1 + cbnz count, layer4567_start + // gap // .................. + mls v28.8H, v19.8H, v7.H[0] // *................. + // gap // .................. + // gap // .................. + mul v27.8H, v21.8H, v5.H[0] // ..*............... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + sqdmulh v19.8H, v0.8H, v7.H[1] // ....*............. 
+ // gap // .................. + // gap // .................. + // gap // .................. + sqdmulh v22.8H, v28.8H, v7.H[1] // ...*.............. + // gap // .................. + // gap // .................. + sqrdmulh v24.8H, v21.8H, v5.H[1] // .*................ + // gap // .................. + // gap // .................. + // gap // .................. + srshr v23.8H, v19.8H, #11 // .......*.......... + // gap // .................. + // gap // .................. + // gap // .................. + srshr v19.8H, v22.8H, #11 // ........*......... + // gap // .................. + // gap // .................. + // gap // .................. + mls v27.8H, v24.8H, v7.H[0] // .....*............ + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mls v28.8H, v19.8H, v7.H[0] // .........*........ + // gap // .................. + mls v0.8H, v23.8H, v7.H[0] // ..........*....... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + sub v19.8H, v0.8H, v28.8H // ...........*...... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + mul v23.8H, v19.8H, v5.H[0] // ..............*... + sqrdmulh v19.8H, v19.8H, v5.H[1] // ...............*.. + str q27, [x1, #-32] // ......*........... + // gap // .................. + add v22.8H, v0.8H, v28.8H // ............*..... + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. 
+ // gap // .................. + // gap // .................. + str q22, [x1, #-48] // .............*.... + mls v23.8H, v19.8H, v7.H[0] // ................*. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + // gap // .................. + str q23, [x1, #-16] // .................* + // gap // .................. + // gap // .................. + // gap // .................. + + // NOTE: the listing below records the same straight-line block in its pre-scheduling order and with its pre-renaming registers. + // How to read the trailing marker columns: in the scheduled code above, the marker column is the instruction's index in this listing; + // in this listing, the marker column is the instruction's position in the scheduled order. + // Lines reading "gap" carry no instruction; presumably they mark issue slots the scheduler left empty (TODO confirm against the optimizer's output format). + // original source code + // mls v28.8H, v19.8H, v7.H[0] // *................. + // sqrdmulh v19.8H, v21.8H, v5.H[1] // ....*............. + // mul v11.8H, v21.8H, v5.H[0] // .*................ + // sqdmulh v12.8H, v28.8H, v7.H[1] // ...*.............. + // sqdmulh v27.8H, v0.8H, v7.H[1] // ..*............... + // mls v11.8H, v19.8H, v7.H[0] // .......*.......... + // str q11, [x1, #-32] // .............*.... + // srshr v13.8H, v27.8H, #11 // .....*............ + // srshr v22.8H, v12.8H, #11 // ......*........... + // mls v28.8H, v22.8H, v7.H[0] // ........*......... + // mls v0.8H, v13.8H, v7.H[0] // .........*........ + // sub v6.8H, v0.8H, v28.8H // ..........*....... + // add v28.8H, v0.8H, v28.8H // ..............*... + // str q28, [x1, #-48] // ...............*.. + // mul v19.8H, v6.8H, v5.H[0] // ...........*...... + // sqrdmulh v8.8H, v6.8H, v5.H[1] // ............*..... + // mls v19.8H, v8.8H, v7.H[0] // ................*. + // str q19, [x1, #-16] // .................* + + + // --------------------------------------------------------------------- + // Setup for the layer123_start loop: bind two constant registers, load them, set the loop counter and load the roots. + + // Register aliases. The scheduled layer123_start loop body names these registers by number (v29.8H as a mul operand, v30.8H as a sqrdmulh operand), + // so the aliases must stay bound to v29 and v30. Presumably ninv is the inverse-NTT scaling factor and ninv_tw its precomputed companion for sqrdmulh (TODO confirm against the definitions behind ninv_addr and ninv_tw_addr). + ninv .req v29 + ninv_tw .req v30 + + // ASM_LOAD is a macro defined outside this chunk; presumably it puts the address of the named symbol into xtmp (TODO confirm). + // ld1r then loads a single 16-bit element from [xtmp] and replicates it into all eight .8h lanes of the destination. + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + // Loop counter for layer123_start; it is decremented once more after the pre-loop code below, before the loop label is reached. + mov count, #4 + // Point r_ptr0 at roots_l012 and invoke load_roots_123 (macro defined elsewhere; presumably it fills v0 and v1, which the following code indexes by lane, TODO confirm). + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + // Align the next instruction to a 4-byte boundary (2^2). The instructions from here up to the layer123_start label run once before the loop; + // presumably they are the software-pipelining preamble generated for that loop, and their own "original source code" listing follows them. + .p2align 2 + + ldr q31, [x0, #320] // .*................................
+ ldr q22, [x0, #256] // *................................. + // gap // .................................. + // gap // .................................. + ldr q8, [x0, #448] // ....*............................. + ldr q16, [x0, #384] // ..*............................... + // gap // .................................. + // gap // .................................. + ldr q4, [x0, #64] // .......*.......................... + // gap // .................................. + // gap // .................................. + // gap // .................................. + ldr q9, [x0, #0] // .....*............................ + // gap // .................................. + // gap // .................................. + // gap // .................................. + sub v3.8H, v22.8H, v31.8H // ......*........................... + add v6.8H, v22.8H, v31.8H // ........*......................... + ldr q18, [x0, #128] // ...*.............................. + // gap // .................................. + sub v23.8H, v16.8H, v8.8H // ..........*....................... + add v22.8H, v16.8H, v8.8H // .............*.................... + ldr q19, [x0, #192] // ..................*............... + // gap // .................................. + sqrdmulh v31.8H, v3.8H, v1.H[3] // .........*........................ + mul v13.8H, v3.8H, v1.H[2] // ...........*...................... + // gap // .................................. + // gap // .................................. + mul v2.8H, v23.8H, v1.H[4] // ..............*................... + sqrdmulh v11.8H, v23.8H, v1.H[5] // ...............*.................. + // gap // .................................. + // gap // .................................. + sub v25.8H, v9.8H, v4.8H // ............*..................... + add v9.8H, v9.8H, v4.8H // ................*................. + // gap // .................................. + // gap // .................................. + sub v21.8H, v18.8H, v19.8H // .............................*.... 
+ mls v13.8H, v31.8H, v7.H[0] // .................*................ + // gap // .................................. + // gap // .................................. + add v26.8H, v18.8H, v19.8H // ....................*............. + mls v2.8H, v11.8H, v7.H[0] // ...................*.............. + // gap // .................................. + // gap // .................................. + sqrdmulh v23.8H, v25.8H, v0.H[7] // ...........................*...... + // gap // .................................. + // gap // .................................. + // gap // .................................. + add v31.8H, v9.8H, v26.8H // ............................*..... + sub v19.8H, v6.8H, v22.8H // ..........................*....... + // gap // .................................. + // gap // .................................. + add v6.8H, v6.8H, v22.8H // ........................*......... + sub v28.8H, v13.8H, v2.8H // ......................*........... + // gap // .................................. + // gap // .................................. + mul v15.8H, v25.8H, v0.H[6] // .....................*............ + // gap // .................................. + // gap // .................................. + // gap // .................................. + mul v16.8H, v28.8H, v0.H[4] // .......................*.......... + sqrdmulh v11.8H, v28.8H, v0.H[5] // .........................*........ + // gap // .................................. + // gap // .................................. + add v24.8H, v31.8H, v6.8H // ................................*. + mul v5.8H, v19.8H, v0.H[4] // ...............................*.. + // gap // .................................. + // gap // .................................. + mls v15.8H, v23.8H, v7.H[0] // .................................* + // gap // .................................. + // gap // .................................. + // gap // .................................. 
+ mls v16.8H, v11.8H, v7.H[0] // ..............................*... + // gap // .................................. + // gap // .................................. + // gap // .................................. + + // original source code + // ldr q28, [x0, #256] // .*................................ + // ldr q18, [x0, #320] // *................................. + // ldr q23, [x0, #384] // ...*.............................. + // ldr q10, [x0, #128] // ........*......................... + // ldr q12, [x0, #448] // ..*............................... + // ldr q4, [x0, #0] // .....*............................ + // sub v17.8H, v28.8H, v18.8H // ......*........................... + // ldr q9, [x0, #64] // ....*............................. + // add v18.8H, v28.8H, v18.8H // .......*.......................... + // sqrdmulh v19.8H, v17.8H, v1.H[3] // ............*..................... + // sub v6.8H, v23.8H, v12.8H // .........*........................ + // mul v13.8H, v17.8H, v1.H[2] // .............*.................... + // sub v11.8H, v4.8H, v9.8H // ................*................. + // add v25.8H, v23.8H, v12.8H // ..........*....................... + // mul v2.8H, v6.8H, v1.H[4] // ..............*................... + // sqrdmulh v17.8H, v6.8H, v1.H[5] // ...............*.................. + // add v9.8H, v4.8H, v9.8H // .................*................ + // mls v13.8H, v19.8H, v7.H[0] // ...................*.............. + // ldr q21, [x0, #192] // ...........*...................... + // mls v2.8H, v17.8H, v7.H[0] // .....................*............ + // add v26.8H, v10.8H, v21.8H // ....................*............. + // mul v15.8H, v11.8H, v0.H[6] // ...........................*...... + // sub v5.8H, v13.8H, v2.8H // ..........................*....... + // mul v16.8H, v5.8H, v0.H[4] // ............................*..... + // add v6.8H, v18.8H, v25.8H // .........................*........ 
+ // sqrdmulh v22.8H, v5.8H, v0.H[5] // .............................*.... + // sub v19.8H, v18.8H, v25.8H // ........................*......... + // sqrdmulh v20.8H, v11.8H, v0.H[7] // ......................*........... + // add v31.8H, v9.8H, v26.8H // .......................*.......... + // sub v21.8H, v10.8H, v21.8H // ..................*............... + // mls v16.8H, v22.8H, v7.H[0] // .................................* + // mul v5.8H, v19.8H, v0.H[4] // ...............................*.. + // add v24.8H, v31.8H, v6.8H // ..............................*... + // mls v15.8H, v20.8H, v7.H[0] // ................................*. + + sub count, count, #1 +layer123_start: + sub v27.8H, v31.8H, v6.8H // ................................................*....................................... + sqrdmulh v22.8H, v19.8H, v0.H[5] // .........................................*.............................................. + ldr q28, [x0, #272] // ....e................................................................................... + ldr q18, [x0, #336] // .....e.................................................................................. + // gap // ........................................................................................ + sub v19.8H, v9.8H, v26.8H // ............................*........................................................... + add v26.8H, v13.8H, v2.8H // ............................................*........................................... + ldr q23, [x0, #400] // ......e................................................................................. + ldr q10, [x0, #144] // ..e..................................................................................... + mul v20.8H, v24.8H, v29.8H // ........................................................................*............... + ldr q12, [x0, #464] // .......e................................................................................ 
+ mul v31.8H, v27.8H, v0.H[0] // ..................................................*..................................... + mul v8.8H, v19.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v14.8H, v19.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + sqrdmulh v27.8H, v27.8H, v0.H[1] // ...................................................*.................................... + ldr q4, [x0, #16] // e....................................................................................... + // gap // ........................................................................................ + sub v17.8H, v28.8H, v18.8H // ..................e..................................................................... + mls v5.8H, v22.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ + ldr q9, [x0, #80] // .e...................................................................................... + add v18.8H, v28.8H, v18.8H // ...................e.................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v17.8H, v1.H[3] // .....................e.................................................................. + sub v6.8H, v23.8H, v12.8H // .......................e................................................................ 
+ mul v13.8H, v17.8H, v1.H[2] // ....................e................................................................... + // gap // ........................................................................................ + mls v31.8H, v27.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + mul v27.8H, v21.8H, v1.H[0] // ...............*........................................................................ + sqrdmulh v17.8H, v21.8H, v1.H[1] // ................*....................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v8.8H, v14.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v11.8H, v4.8H, v9.8H // ........e............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v25.8H, v23.8H, v12.8H // ........................e............................................................... + mul v2.8H, v6.8H, v1.H[4] // .........................e.............................................................. + str q31, [x0, #256] // ....................................................................*................... 
+ mls v27.8H, v17.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ + sqrdmulh v17.8H, v6.8H, v1.H[5] // ..........................e............................................................. + sub v3.8H, v8.8H, v5.8H // ..........................................................*............................. + add v28.8H, v8.8H, v5.8H // ...........................................................*............................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v9.8H, v4.8H, v9.8H // .........e.............................................................................. + mls v13.8H, v19.8H, v7.H[0] // ......................e................................................................. + // gap // ........................................................................................ + ldr q21, [x0, #208] // ...e.................................................................................... + sub v12.8H, v15.8H, v27.8H // .................................*...................................................... + add v23.8H, v15.8H, v27.8H // ..................................*..................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v27.8H, v28.8H, v29.8H // ..............................................................................*......... + mls v2.8H, v17.8H, v7.H[0] // ...........................e............................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v17.8H, v12.8H, v0.H[2] // ...................................*.................................................... + add v19.8H, v23.8H, v26.8H // ......................................................*................................. + sqrdmulh v22.8H, v12.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v31.8H, v23.8H, v26.8H // .....................................................*.................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v4.8H, v19.8H, v29.8H // ...........................................................................*............ + add v26.8H, v10.8H, v21.8H // ..............e......................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v6.8H, v31.8H, v0.H[1] // ........................................................*............................... 
+ mul v31.8H, v31.8H, v0.H[0] // .......................................................*................................ + mls v17.8H, v22.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v15.8H, v11.8H, v0.H[6] // ..........e............................................................................. + sqrdmulh v8.8H, v19.8H, v30.8H // ............................................................................*........... + sub v5.8H, v13.8H, v2.8H // ...........................................e............................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v28.8H, v30.8H // ...............................................................................*........ + mls v31.8H, v6.8H, v7.H[0] // .........................................................*.............................. + sub v12.8H, v17.8H, v16.8H // ...............................................................*........................ + // gap // ........................................................................................ + add v22.8H, v17.8H, v16.8H // ................................................................*....................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v16.8H, v5.8H, v0.H[4] // .............................................e.......................................... + add v6.8H, v18.8H, v25.8H // .......................................e................................................ + mul v19.8H, v22.8H, v29.8H // .................................................................................*...... + sqrdmulh v22.8H, v22.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v17.8H, v24.8H, v30.8H // .........................................................................*.............. + mul v24.8H, v12.8H, v0.H[0] // .................................................................*...................... + str q31, [x0, #320] // .....................................................................*.................. + // gap // ........................................................................................ + sqrdmulh v28.8H, v12.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v27.8H, v23.8H, v7.H[0] // ................................................................................*....... + mls v19.8H, v22.8H, v7.H[0] // ...................................................................................*.... 
+ sqrdmulh v14.8H, v3.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v20.8H, v17.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + mul v12.8H, v3.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + mls v4.8H, v8.8H, v7.H[0] // .............................................................................*.......... + mls v24.8H, v28.8H, v7.H[0] // ...................................................................*.................... + str q27, [x0, #128] // ......................................................................................*. + str q19, [x0, #192] // .......................................................................................* + sqrdmulh v22.8H, v5.8H, v0.H[5] // ..............................................e......................................... + // gap // ........................................................................................ + sub v19.8H, v18.8H, v25.8H // ......................................e................................................. + mls v12.8H, v14.8H, v7.H[0] // ..............................................................*......................... + str q20, [x0], #(16) // ....................................................................................*... 
+ sqrdmulh v20.8H, v11.8H, v0.H[7] // ...........e............................................................................ + // gap // ........................................................................................ + str q24, [x0, #432] // .......................................................................*................ + add v31.8H, v9.8H, v26.8H // .............................e.......................................................... + sub v21.8H, v10.8H, v21.8H // .............e.......................................................................... + // gap // ........................................................................................ + mls v16.8H, v22.8H, v7.H[0] // ...............................................e........................................ + str q4, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + mul v5.8H, v19.8H, v0.H[4] // ........................................e............................................... + str q12, [x0, #368] // ......................................................................*................. + add v24.8H, v31.8H, v6.8H // .................................................e...................................... + // gap // ........................................................................................ + mls v15.8H, v20.8H, v7.H[0] // ............e........................................................................... + + // original source code + // ldr q8, [x0, #0] // ............e.........................................................................|.............e....................................................................... 
+ // ldr q9, [x0, #(1*(512/8))] // ...............e......................................................................|................e.................................................................... + // ldr q10, [x0, #(2*(512/8))] // .....e................................................................................|......e.............................................................................. + // ldr q11, [x0, #(3*(512/8))] // ..................................e...................................................|...................................e................................................. + // ldr q12, [x0, #(4*(512/8))] // e.....................................................................................|.e................................................................................... + // ldr q13, [x0, #(5*(512/8))] // .e....................................................................................|..e.................................................................................. + // ldr q14, [x0, #(6*(512/8))] // ....e.................................................................................|.....e............................................................................... + // ldr q15, [x0, #(7*(512/8))] // .......e..............................................................................|........e............................................................................ + // sub v24.8h, v8.8h, v9.8h // ........................e.............................................................|.........................e........................................................... + // add v8.8h, v8.8h, v9.8h // ................................e.....................................................|.................................e................................................... 
+ // mul v9.8h, v24.8h, v0.h[6] // ................................................e.....................................|.................................................e................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ............................................................................e.........|.............................................................................e....... + // mls v9.8h, v24.8h, v7.h[0] // .....................................................................................e|..................................................................................... + // sub v24.8h, v10.8h, v11.8h // ...............................................................................e......|................................................................................e.... + // add v10.8h, v10.8h, v11.8h // ............................................e.........................................|.............................................e....................................... + // mul v11.8h, v24.8h, v1.h[0] // .....................*................................................................|......................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ......................*...............................................................|.......................*............................................................. + // mls v11.8h, v24.8h, v7.h[0] // ............................*.........................................................|.............................*....................................................... + // sub v24.8h, v12.8h, v13.8h // .............e........................................................................|..............e...................................................................... 
+ // add v12.8h, v12.8h, v13.8h // ................e.....................................................................|.................e................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ...................e..................................................................|....................e................................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[3] // .................e....................................................................|..................e.................................................................. + // mls v13.8h, v24.8h, v7.h[0] // .................................e....................................................|..................................e.................................................. + // sub v24.8h, v14.8h, v15.8h // ..................e...................................................................|...................e................................................................. + // add v14.8h, v14.8h, v15.8h // .........................e............................................................|..........................e.......................................................... + // mul v15.8h, v24.8h, v1.h[4] // ..........................e...........................................................|...........................e......................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .............................e........................................................|..............................e...................................................... + // mls v15.8h, v24.8h, v7.h[0] // ......................................e...............................................|.......................................e............................................. 
+ // sub v24.8h, v8.8h, v10.8h // ..*...................................................................................|...*................................................................................. + // add v8.8h, v8.8h, v10.8h // ..............................................................................e.......|...............................................................................e..... + // mul v10.8h, v24.8h, v0.h[2] // .........*............................................................................|..........*.......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........*...........................................................................|...........*......................................................................... + // mls v10.8h, v24.8h, v7.h[0] // .......................*..............................................................|........................*............................................................ + // sub v24.8h, v9.8h, v11.8h // ...................................*..................................................|....................................*................................................ + // add v9.8h, v9.8h, v11.8h // ....................................*.................................................|.....................................*............................................... + // mul v11.8h, v24.8h, v0.h[2] // .......................................*..............................................|........................................*............................................ + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .........................................*............................................|..........................................*.......................................... 
+ // mls v11.8h, v24.8h, v7.h[0] // ...............................................*......................................|................................................*.................................... + // sub v24.8h, v12.8h, v14.8h // .........................................................................e............|..........................................................................e.......... + // add v12.8h, v12.8h, v14.8h // ........................................................e.............................|.........................................................e........................... + // mul v14.8h, v24.8h, v0.h[4] // ..................................................................................e...|...................................................................................e. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ......................................................................................|*.................................................................................... + // mls v14.8h, v24.8h, v7.h[0] // ..............*.......................................................................|...............*..................................................................... + // sub v24.8h, v13.8h, v15.8h // ..................................................e...................................|...................................................e................................. + // add v13.8h, v13.8h, v15.8h // ...*..................................................................................|....*................................................................................ + // mul v15.8h, v24.8h, v0.h[4] // .......................................................e..............................|........................................................e............................ 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // ........................................................................e.............|.........................................................................e........... + // mls v15.8h, v24.8h, v7.h[0] // ................................................................................e.....|.................................................................................e... + // sub v24.8h, v8.8h, v12.8h // ......................................................................................*..................................................................................... + // add v8.8h, v8.8h, v12.8h // ....................................................................................e.|..................................................................................... + // mul v12.8h, v24.8h, v0.h[0] // ........*.............................................................................|.........*........................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........*..........................................................................|............*........................................................................ + // mls v12.8h, v24.8h, v7.h[0] // ....................*.................................................................|.....................*............................................................... + // sub v24.8h, v9.8h, v13.8h // ..........................................*...........................................|...........................................*......................................... + // add v9.8h, v9.8h, v13.8h // ........................................*.............................................|.........................................*........................................... 
+ // mul v13.8h, v24.8h, v0.h[0] // ..............................................*.......................................|...............................................*..................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............................................*........................................|..............................................*...................................... + // mls v13.8h, v24.8h, v7.h[0] // ....................................................*.................................|.....................................................*............................... + // sub v24.8h, v10.8h, v14.8h // ..............................*.......................................................|...............................*..................................................... + // add v10.8h, v10.8h, v14.8h // ...............................*......................................................|................................*.................................................... + // mul v14.8h, v24.8h, v0.h[0] // ...................................................................*..................|....................................................................*................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .................................................................*....................|..................................................................*.................. + // mls v14.8h, v24.8h, v7.h[0] // ..........................................................................*...........|...........................................................................*......... + // sub v24.8h, v11.8h, v15.8h // .....................................................*................................|......................................................*.............................. 
+ // add v11.8h, v11.8h, v15.8h // ......................................................*...............................|.......................................................*............................. + // mul v15.8h, v24.8h, v0.h[0] // ............................................................*.........................|.............................................................*....................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*.......................|...............................................................*..................... + // mls v15.8h, v24.8h, v7.h[0] // .....................................................................*................|......................................................................*.............. + // str q12, [x0, #(4*(512/8))] // ...........................*..........................................................|............................*........................................................ + // str q13, [x0, #(5*(512/8))] // .............................................................*........................|..............................................................*...................... + // str q14, [x0, #(6*(512/8))] // ...................................................................................*..|....................................................................................* + // str q15, [x0, #(7*(512/8))] // .............................................................................*........|..............................................................................*...... + // mul v12.8h, v8.8h, v29.8h // ......*...............................................................................|.......*............................................................................. 
+ // sqrdmulh v8.8h, v8.8h, v30.8h // ...........................................................*..........................|............................................................*........................ + // mls v12.8h, v8.8h, v7.h[0] // ..................................................................*...................|...................................................................*................. + // mul v13.8h, v9.8h, v29.8h // ...........................................*..........................................|............................................*........................................ + // sqrdmulh v9.8h, v9.8h, v30.8h // .................................................*....................................|..................................................*.................................. + // mls v13.8h, v9.8h, v7.h[0] // ....................................................................*.................|.....................................................................*............... + // mul v14.8h, v10.8h, v29.8h // .....................................*................................................|......................................*.............................................. + // sqrdmulh v10.8h, v10.8h, v30.8h // ...................................................*..................................|....................................................*................................ + // mls v14.8h, v10.8h, v7.h[0] // ...............................................................*......................|................................................................*.................... + // mul v15.8h, v11.8h, v29.8h // .........................................................*............................|..........................................................*.......................... 
+ // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................*...........................|...........................................................*......................... + // mls v15.8h, v11.8h, v7.h[0] // ................................................................*.....................|.................................................................*................... + // str q12, [x0], #(16) // ...........................................................................*..........|............................................................................*........ + // str q13, [x0, #(-16 + 1*(512/8))] // .................................................................................*....|..................................................................................*.. + // str q14, [x0, #(-16 + 2*(512/8))] // ......................................................................*...............|.......................................................................*............. + // str q15, [x0, #(-16 + 3*(512/8))] // .......................................................................*..............|........................................................................*............ + + sub count, count, #1 + cbnz count, layer123_start + mul v25.8H, v21.8H, v1.H[0] // ...........*.......................................... + sqrdmulh v3.8H, v21.8H, v1.H[1] // ............*......................................... + // gap // ...................................................... + // gap // ...................................................... + sub v12.8H, v31.8H, v6.8H // *..................................................... + sub v23.8H, v9.8H, v26.8H // ..*................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ mul v22.8H, v24.8H, v29.8H // ....*................................................. + sqrdmulh v28.8H, v19.8H, v0.H[5] // .*.................................................... + // gap // ...................................................... + // gap // ...................................................... + mls v25.8H, v3.8H, v7.H[0] // ...............*...................................... + sqrdmulh v3.8H, v12.8H, v0.H[1] // ........*............................................. + // gap // ...................................................... + // gap // ...................................................... + mul v11.8H, v23.8H, v0.H[2] // ......*............................................... + sqrdmulh v4.8H, v23.8H, v0.H[3] // .......*.............................................. + // gap // ...................................................... + // gap // ...................................................... + add v27.8H, v13.8H, v2.8H // ...*.................................................. + sqrdmulh v18.8H, v24.8H, v30.8H // ....................................*................. + // gap // ...................................................... + // gap // ...................................................... + sub v10.8H, v15.8H, v25.8H // ..................*................................... + add v25.8H, v15.8H, v25.8H // ...................*.................................. + // gap // ...................................................... + // gap // ...................................................... + mls v11.8H, v4.8H, v7.H[0] // .............*........................................ + mls v5.8H, v28.8H, v7.H[0] // .........*............................................ + // gap // ...................................................... + // gap // ...................................................... + sub v21.8H, v25.8H, v27.8H // ........................*............................. 
+ mls v22.8H, v18.8H, v7.H[0] // ...........................................*.......... + // gap // ...................................................... + // gap // ...................................................... + mul v13.8H, v10.8H, v0.H[2] // .....................*................................ + sqrdmulh v14.8H, v10.8H, v0.H[3] // .......................*.............................. + // gap // ...................................................... + // gap // ...................................................... + sub v9.8H, v11.8H, v5.8H // ................*..................................... + mul v24.8H, v21.8H, v0.H[0] // ...........................*.......................... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v19.8H, v21.8H, v0.H[1] // ..........................*........................... + add v17.8H, v11.8H, v5.8H // .................*.................................... + str q22, [x0], #(16) // ..................................................*... + // gap // ...................................................... + mls v13.8H, v14.8H, v7.H[0] // ............................*......................... + mul v26.8H, v9.8H, v0.H[0] // ............................................*......... + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v4.8H, v9.8H, v0.H[1] // ..........................................*........... + mul v6.8H, v17.8H, v29.8H // ....................*................................. + // gap // ...................................................... + // gap // ...................................................... + sqrdmulh v5.8H, v17.8H, v30.8H // ..............................*....................... + mul v31.8H, v12.8H, v0.H[0] // .....*................................................ 
+ // gap // ...................................................... + // gap // ...................................................... + add v11.8H, v13.8H, v16.8H // .................................*.................... + add v28.8H, v25.8H, v27.8H // ......................*............................... + // gap // ...................................................... + // gap // ...................................................... + sub v16.8H, v13.8H, v16.8H // ................................*..................... + mls v26.8H, v4.8H, v7.H[0] // .................................................*.... + // gap // ...................................................... + // gap // ...................................................... + mul v18.8H, v11.8H, v29.8H // ..................................*................... + sqrdmulh v10.8H, v11.8H, v30.8H // ...................................*.................. + // gap // ...................................................... + // gap // ...................................................... + mul v2.8H, v16.8H, v0.H[0] // .....................................*................ + sqrdmulh v11.8H, v16.8H, v0.H[1] // .......................................*.............. + // gap // ...................................................... + // gap // ...................................................... + str q26, [x0, #368] // .....................................................* + mls v31.8H, v3.8H, v7.H[0] // ..........*........................................... + sqrdmulh v8.8H, v28.8H, v30.8H // .............................*........................ + // gap // ...................................................... + mls v18.8H, v10.8H, v7.H[0] // .........................................*............ + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... 
+ mul v20.8H, v28.8H, v29.8H // .........................*............................ + mls v2.8H, v11.8H, v7.H[0] // ..............................................*....... + // gap // ...................................................... + // gap // ...................................................... + mls v6.8H, v5.8H, v7.H[0] // ........................................*............. + str q31, [x0, #240] // ..............*....................................... + // gap // ...................................................... + // gap // ...................................................... + str q18, [x0, #176] // ................................................*..... + mls v24.8H, v19.8H, v7.H[0] // ...............................*...................... + // gap // ...................................................... + // gap // ...................................................... + mls v20.8H, v8.8H, v7.H[0] // .............................................*........ + str q2, [x0, #432] // ...................................................*.. + // gap // ...................................................... + // gap // ...................................................... + str q6, [x0, #112] // ...............................................*...... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + str q24, [x0, #304] // ......................................*............... + // gap // ...................................................... + // gap // ...................................................... + // gap // ...................................................... + str q20, [x0, #48] // ....................................................*. + // gap // ...................................................... + // gap // ...................................................... 
+ // gap // ...................................................... + + // original source code + // sub v27.8H, v31.8H, v6.8H // ..*................................................... + // sqrdmulh v22.8H, v19.8H, v0.H[5] // .....*................................................ + // sub v19.8H, v9.8H, v26.8H // ...*.................................................. + // add v26.8H, v13.8H, v2.8H // ..........*........................................... + // mul v20.8H, v24.8H, v29.8H // ....*................................................. + // mul v31.8H, v27.8H, v0.H[0] // ..............................*....................... + // mul v8.8H, v19.8H, v0.H[2] // ........*............................................. + // sqrdmulh v14.8H, v19.8H, v0.H[3] // .........*............................................ + // sqrdmulh v27.8H, v27.8H, v0.H[1] // .......*.............................................. + // mls v5.8H, v22.8H, v7.H[0] // ...............*...................................... + // mls v31.8H, v27.8H, v7.H[0] // ........................................*............. + // mul v27.8H, v21.8H, v1.H[0] // *..................................................... + // sqrdmulh v17.8H, v21.8H, v1.H[1] // .*.................................................... + // mls v8.8H, v14.8H, v7.H[0] // ..............*....................................... + // str q31, [x0, #256] // ..............................................*....... + // mls v27.8H, v17.8H, v7.H[0] // ......*............................................... + // sub v3.8H, v8.8H, v5.8H // ....................*................................. + // add v28.8H, v8.8H, v5.8H // .......................*.............................. + // sub v12.8H, v15.8H, v27.8H // ............*......................................... + // add v23.8H, v15.8H, v27.8H // .............*........................................ + // mul v27.8H, v28.8H, v29.8H // ............................*......................... 
+ // mul v17.8H, v12.8H, v0.H[2] // ..................*................................... + // add v19.8H, v23.8H, v26.8H // ................................*..................... + // sqrdmulh v22.8H, v12.8H, v0.H[3] // ...................*.................................. + // sub v31.8H, v23.8H, v26.8H // ................*..................................... + // mul v4.8H, v19.8H, v29.8H // ...........................................*.......... + // sqrdmulh v6.8H, v31.8H, v0.H[1] // ......................*............................... + // mul v31.8H, v31.8H, v0.H[0] // .....................*................................ + // mls v17.8H, v22.8H, v7.H[0] // .........................*............................ + // sqrdmulh v8.8H, v19.8H, v30.8H // .........................................*............ + // sqrdmulh v23.8H, v28.8H, v30.8H // .............................*........................ + // mls v31.8H, v6.8H, v7.H[0] // ................................................*..... + // sub v12.8H, v17.8H, v16.8H // .................................*.................... + // add v22.8H, v17.8H, v16.8H // ...............................*...................... + // mul v19.8H, v22.8H, v29.8H // ...................................*.................. + // sqrdmulh v22.8H, v22.8H, v30.8H // ....................................*................. + // sqrdmulh v17.8H, v24.8H, v30.8H // ...........*.......................................... + // mul v24.8H, v12.8H, v0.H[0] // .....................................*................ + // str q31, [x0, #320] // ....................................................*. + // sqrdmulh v28.8H, v12.8H, v0.H[1] // ......................................*............... + // mls v27.8H, v23.8H, v7.H[0] // .............................................*........ + // mls v19.8H, v22.8H, v7.H[0] // ..........................................*........... 
+ // sqrdmulh v14.8H, v3.8H, v0.H[1] // ...........................*.......................... + // mls v20.8H, v17.8H, v7.H[0] // .................*.................................... + // mul v12.8H, v3.8H, v0.H[0] // ..........................*........................... + // mls v4.8H, v8.8H, v7.H[0] // .................................................*.... + // mls v24.8H, v28.8H, v7.H[0] // ............................................*......... + // str q27, [x0, #128] // ...................................................*.. + // str q19, [x0, #192] // ...............................................*...... + // mls v12.8H, v14.8H, v7.H[0] // ..................................*................... + // str q20, [x0], #(16) // ........................*............................. + // str q24, [x0, #432] // ..................................................*... + // str q4, [x0, #48] // .....................................................* + // str q12, [x0, #368] // .......................................*.............. 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s new file mode 100644 index 00000000..a524ab50 --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_a55.s @@ -0,0 +1,1516 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. 
+ +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, 
r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 // root0 from r_ptr0, then r_ptr0 += 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) // r_ptr1 += 96 after the load; the negative offsets below address the next five vectors of that 96-byte group + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s // stage 1: interleave 32-bit lanes of rows 0/1 and 2/3 into t0-t3 (clobbered) + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d // stage 2: interleave 64-bit lanes back into data0-data3 + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s // only the 32-bit interleave stage, written to a separate output register set + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) // six 16-byte slots: x19-x28 as five pairs, x29 alone in the last slot + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] // reload from the same slots that save_gprs filled + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the frame reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // four 16-byte slots for d8-d15, i.e. only the low 64 bits of v8-v15 are stored + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] // reload from the same slots that save_vregs filled + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the frame reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] // reload a from the stack slot at sp + loc; note that save takes its arguments in the opposite order +.endm +.macro save loc, a // slothy:no-unfold + str \a,
[sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // extra STACK_SIZE bytes of scratch space below the saved registers +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order: scratch space, vector registers, then GPRs + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_opt_a55 + .global _intt_kyber_123_4567_opt_a55 + +.p2align 4 +const_addr: .short 3329 // lane 0 of consts: mls multiplier in mulmod, mulmodq and barrett_reduce + .short 20159 // lane 1 of consts: sqdmulh multiplier in barrett_reduce + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 // eight lanes of 512; presumably the ninv operand of mul_ninv - TODO confirm against the rest of the file + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 // eight lanes of 5040; presumably the ninv_tw operand of mul_ninv - TODO confirm against the rest of the file + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_opt_a55: +_intt_kyber_123_4567_opt_a55: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req
q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q23, [x1, #0] // *.......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q27, [x1, #16] // .*......... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q3, [x1, #32] // ..*........ + // gap // ........... + // gap // ........... + // gap // ........... + ldr q28, [x1, #48] // ...*....... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q29, [x4], #(6*16) // .....*..... + // gap // ........... + // gap // ........... + // gap // ........... + trn1 v26.4S, v3.4S, v28.4S // ....*...... + // gap // ........... + ldr q20, [x4, #-80] // ......*.... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q24, [x4, #-64] // .......*... + // gap // ........... + // gap // ........... + // gap // ........... + ldr q11, [x4, #-48] // ........*.. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q14, [x4, #-32] // .........*. + // gap // ........... + // gap // ........... + // gap // ........... + ldr q25, [x4, #-16] // ..........* + // gap // ........... + + // original source code + // ldr q23, [x1, #0] // *.......... + // ldr q27, [x1, #16] // .*......... + // ldr q3, [x1, #32] // ..*........ 
+ // ldr q28, [x1, #48] // ...*....... + // trn1 v26.4S, v3.4S, v28.4S // .....*..... + // ldr q29, [x4], #(6*16) // ....*...... + // ldr q20, [x4, #-80] // ......*.... + // ldr q24, [x4, #-64] // .......*... + // ldr q11, [x4, #-48] // ........*.. + // ldr q14, [x4, #-32] // .........*. + // ldr q25, [x4, #-16] // ..........* + + sub count, count, #1 +layer4567_start: + trn1 v19.4S, v23.4S, v27.4S // ....*.............................................................................. + // gap // ................................................................................... + trn2 v23.4S, v23.4S, v27.4S // .....*............................................................................. + // gap // ................................................................................... + trn2 v22.4S, v3.4S, v28.4S // .......*........................................................................... + // gap // ................................................................................... + trn2 v28.2D, v19.2D, v26.2D // ........*.......................................................................... + // gap // ................................................................................... + trn1 v19.2D, v19.2D, v26.2D // ..........*........................................................................ + // gap // ................................................................................... + trn2 v0.2D, v23.2D, v22.2D // .........*......................................................................... + // gap // ................................................................................... + trn1 v23.2D, v23.2D, v22.2D // ...........*....................................................................... + // gap // ................................................................................... + sub v22.8H, v28.8H, v0.8H // .......................*........................................................... 
+ // gap // ................................................................................... + add v28.8H, v28.8H, v0.8H // ........................*.......................................................... + // gap // ................................................................................... + sub v0.8H, v19.8H, v23.8H // ..................*................................................................ + // gap // ................................................................................... + add v19.8H, v19.8H, v23.8H // ...................*............................................................... + // gap // ................................................................................... + mul v23.8H, v22.8H, v14.8H // .........................*......................................................... + // gap // ................................................................................... + mul v27.8H, v0.8H, v24.8H // ....................*.............................................................. + // gap // ................................................................................... + sqrdmulh v0.8H, v0.8H, v11.8H // .....................*............................................................. + // gap // ................................................................................... + sqrdmulh v22.8H, v22.8H, v25.8H // ..........................*........................................................ + // gap // ................................................................................... + sub v24.8H, v19.8H, v28.8H // ............................*...................................................... + // gap // ................................................................................... + add v19.8H, v19.8H, v28.8H // .............................*..................................................... 
+ // gap // ................................................................................... + mls v27.8H, v0.8H, v7.H[0] // ......................*............................................................ + // gap // ................................................................................... + mls v23.8H, v22.8H, v7.H[0] // ...........................*....................................................... + // gap // ................................................................................... + mul v22.8H, v24.8H, v29.8H // ..............................*.................................................... + // gap // ................................................................................... + sqrdmulh v28.8H, v24.8H, v20.8H // ...............................*................................................... + // gap // ................................................................................... + ldr q0, [x3], #16 // ..............................................*.................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v24.8H, v27.8H, v23.8H // .................................*................................................. + // gap // ................................................................................... + mls v22.8H, v28.8H, v7.H[0] // ................................*.................................................. + // gap // ................................................................................... + add v23.8H, v27.8H, v23.8H // ..................................*................................................ + // gap // ................................................................................... 
+ mul v28.8H, v24.8H, v29.8H // ...................................*............................................... + // gap // ................................................................................... + sqrdmulh v27.8H, v24.8H, v20.8H // ....................................*.............................................. + // gap // ................................................................................... + trn1 v24.4S, v19.4S, v23.4S // ......................................*............................................ + // gap // ................................................................................... + trn2 v19.4S, v19.4S, v23.4S // .......................................*........................................... + // gap // ................................................................................... + ldr q23, [x1, #64] // e.................................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v28.8H, v27.8H, v7.H[0] // .....................................*............................................. + // gap // ................................................................................... + ldr q27, [x1, #80] // .e................................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q3, [x1, #96] // ..e................................................................................ 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v26.4S, v22.4S, v28.4S // ........................................*.......................................... + // gap // ................................................................................... + trn2 v22.4S, v22.4S, v28.4S // .........................................*......................................... + // gap // ................................................................................... + ldr q28, [x1, #112] // ...e............................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v29.2D, v24.2D, v26.2D // ..........................................*........................................ + // gap // ................................................................................... + trn2 v20.2D, v19.2D, v22.2D // ...........................................*....................................... + // gap // ................................................................................... + trn1 v24.2D, v24.2D, v26.2D // ............................................*...................................... + // gap // ................................................................................... + trn1 v19.2D, v19.2D, v22.2D // .............................................*..................................... + // gap // ................................................................................... 
+ sub v22.8H, v29.8H, v20.8H // ....................................................*.............................. + // gap // ................................................................................... + sub v26.8H, v24.8H, v19.8H // ...............................................*................................... + // gap // ................................................................................... + add v19.8H, v24.8H, v19.8H // ................................................*.................................. + // gap // ................................................................................... + mul v24.8H, v22.8H, v0.H[4] // ......................................................*............................ + // gap // ................................................................................... + mul v11.8H, v26.8H, v0.H[2] // .................................................*................................. + // gap // ................................................................................... + sqrdmulh v26.8H, v26.8H, v0.H[3] // ..................................................*................................ + // gap // ................................................................................... + sqrdmulh v22.8H, v22.8H, v0.H[5] // .......................................................*........................... + // gap // ................................................................................... + add v29.8H, v29.8H, v20.8H // .....................................................*............................. + // gap // ................................................................................... + sqdmulh v20.8H, v19.8H, v7.H[1] // .........................................................*......................... + // gap // ................................................................................... 
+ mls v11.8H, v26.8H, v7.H[0] // ...................................................*............................... + // gap // ................................................................................... + mls v24.8H, v22.8H, v7.H[0] // ........................................................*.......................... + // gap // ................................................................................... + sqdmulh v22.8H, v29.8H, v7.H[1] // ............................................................*...................... + // gap // ................................................................................... + srshr v26.8H, v20.8H, #11 // ..........................................................*........................ + // gap // ................................................................................... + sqdmulh v20.8H, v11.8H, v7.H[1] // ...............................................................*................... + // gap // ................................................................................... + sqdmulh v14.8H, v24.8H, v7.H[1] // ..................................................................*................ + // gap // ................................................................................... + mls v19.8H, v26.8H, v7.H[0] // ...........................................................*....................... + // gap // ................................................................................... + srshr v22.8H, v22.8H, #11 // .............................................................*..................... + // gap // ................................................................................... + srshr v26.8H, v20.8H, #11 // ................................................................*.................. + // gap // ................................................................................... 
+ srshr v20.8H, v14.8H, #11 // ...................................................................*............... + // gap // ................................................................................... + mls v29.8H, v22.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + mls v11.8H, v26.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... + mls v24.8H, v20.8H, v7.H[0] // ....................................................................*.............. + // gap // ................................................................................... + trn1 v26.4S, v3.4S, v28.4S // ......e............................................................................ + // gap // ................................................................................... + sub v22.8H, v19.8H, v29.8H // .....................................................................*............. + // gap // ................................................................................... + add v19.8H, v19.8H, v29.8H // ......................................................................*............ + // gap // ................................................................................... + sub v29.8H, v11.8H, v24.8H // ..........................................................................*........ + // gap // ................................................................................... + mul v20.8H, v22.8H, v0.H[0] // .......................................................................*........... + // gap // ................................................................................... 
+ sqrdmulh v22.8H, v22.8H, v0.H[1] // ........................................................................*.......... + // gap // ................................................................................... + mul v14.8H, v29.8H, v0.H[0] // ............................................................................*...... + // gap // ................................................................................... + sqrdmulh v0.8H, v29.8H, v0.H[1] // .............................................................................*..... + // gap // ................................................................................... + add v24.8H, v11.8H, v24.8H // ...........................................................................*....... + // gap // ................................................................................... + mls v20.8H, v22.8H, v7.H[0] // .........................................................................*......... + // gap // ................................................................................... + str q19, [x1], #(64) // ...............................................................................*... + // gap // ................................................................................... + mls v14.8H, v0.8H, v7.H[0] // ..............................................................................*.... + // gap // ................................................................................... + str q24, [x1, #-48] // ................................................................................*.. + // gap // ................................................................................... + ldr q29, [x4], #(6*16) // ............e...................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + str q20, [x1, #-32] // .................................................................................*. + // gap // ................................................................................... + ldr q20, [x4, #-80] // .............e..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q14, [x1, #-16] // ..................................................................................* + // gap // ................................................................................... + ldr q24, [x4, #-64] // ..............e.................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q11, [x4, #-48] // ...............e................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q14, [x4, #-32] // ................e.................................................................. + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + ldr q25, [x4, #-16] // .................e................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + + // original source code + // ldr q8, [x1, #(16*0)] // e.....................................................|............................e................................................. + // ldr q9, [x1, #(16*1)] // ..e...................................................|..............................e............................................... + // ldr q10, [x1, #(16*2)] // ...e..................................................|...............................e.............................................. + // ldr q11, [x1, #(16*3)] // ......e...............................................|..................................e........................................... + // trn1 v25.4s, v8.4s, v9.4s // ......................................................*.............................................................................. + // trn2 v26.4s, v8.4s, v9.4s // ......................................................|*............................................................................. + // trn1 v27.4s, v10.4s, v11.4s // .................................e....................|.............................................................e................ + // trn2 v28.4s, v10.4s, v11.4s // ......................................................|.*............................................................................ 
+ // trn2 v10.2d, v25.2d, v27.2d // ......................................................|..*........................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ......................................................|....*......................................................................... + // trn1 v8.2d, v25.2d, v27.2d // ......................................................|...*.......................................................................... + // trn1 v9.2d, v26.2d, v28.2d // ......................................................|.....*........................................................................ + // ldr q0, [x4], #(6*16) // ..............................................e.......|..........................................................................e... + // ldr q4, [x4, #(-6*16 + 1*16)] // ................................................e.....|............................................................................e. + // ldr q1, [x4, #(-6*16 + 2*16)] // ..................................................e...|.............................................................................. + // ldr q5, [x4, #(-6*16 + 3*16)] // ...................................................e..|.............................................................................. + // ldr q2, [x4, #(-6*16 + 4*16)] // ....................................................e.|.............................................................................. + // ldr q6, [x4, #(-6*16 + 5*16)] // .....................................................e|.............................................................................. + // sub v24.8h, v8.8h, v9.8h // ......................................................|........*..................................................................... 
+ // add v8.8h, v8.8h, v9.8h // ......................................................|.........*.................................................................... + // mul v9.8h, v24.8h, v1.8h // ......................................................|...........*.................................................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // ......................................................|............*................................................................. + // mls v9.8h, v24.8h, v7.h[0] // ......................................................|................*............................................................. + // sub v24.8h, v10.8h, v11.8h // ......................................................|......*....................................................................... + // add v10.8h, v10.8h, v11.8h // ......................................................|.......*...................................................................... + // mul v11.8h, v24.8h, v2.8h // ......................................................|..........*................................................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ......................................................|.............*................................................................ + // mls v11.8h, v24.8h, v7.h[0] // ......................................................|.................*............................................................ + // sub v24.8h, v8.8h, v10.8h // ......................................................|..............*............................................................... + // add v8.8h, v8.8h, v10.8h // ......................................................|...............*.............................................................. 
+ // mul v10.8h, v24.8h, v0.8h // ......................................................|..................*........................................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ......................................................|...................*.......................................................... + // mls v10.8h, v24.8h, v7.h[0] // ......................................................|......................*....................................................... + // sub v24.8h, v9.8h, v11.8h // ......................................................|.....................*........................................................ + // add v9.8h, v9.8h, v11.8h // ......................................................|.......................*...................................................... + // mul v11.8h, v24.8h, v0.8h // ......................................................|........................*..................................................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ......................................................|.........................*.................................................... + // mls v11.8h, v24.8h, v7.h[0] // .*....................................................|.............................*................................................ + // trn1 v25.4s, v8.4s, v9.4s // ......................................................|..........................*................................................... + // trn2 v26.4s, v8.4s, v9.4s // ......................................................|...........................*.................................................. + // trn1 v27.4s, v10.4s, v11.4s // ....*.................................................|................................*............................................. 
+ // trn2 v28.4s, v10.4s, v11.4s // .....*................................................|.................................*............................................ + // trn2 v10.2d, v25.2d, v27.2d // .......*..............................................|...................................*.......................................... + // trn2 v11.2d, v26.2d, v28.2d // ........*.............................................|....................................*......................................... + // trn1 v8.2d, v25.2d, v27.2d // .........*............................................|.....................................*........................................ + // trn1 v9.2d, v26.2d, v28.2d // ..........*...........................................|......................................*....................................... + // ldr q0, [x3], #16 // ......................................................|....................*......................................................... + // sub v24.8h, v8.8h, v9.8h // ............*.........................................|........................................*..................................... + // add v8.8h, v8.8h, v9.8h // .............*........................................|.........................................*.................................... + // mul v9.8h, v24.8h, v0.h[2] // ...............*......................................|...........................................*.................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*.....................................|............................................*................................. + // mls v9.8h, v24.8h, v7.h[0] // ....................*.................................|................................................*............................. 
+ // sub v24.8h, v10.8h, v11.8h // ...........*..........................................|.......................................*...................................... + // add v10.8h, v10.8h, v11.8h // ..................*...................................|..............................................*............................... + // mul v11.8h, v24.8h, v0.h[4] // ..............*.......................................|..........................................*................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*....................................|.............................................*................................ + // mls v11.8h, v24.8h, v7.h[0] // .....................*................................|.................................................*............................ + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...................*..................................|...............................................*.............................. + // srshr v25.8h, v25.8h, #11 // .......................*..............................|...................................................*.......................... + // mls v8.8h, v25.8h, v7.h[0] // ..........................*...........................|......................................................*....................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ......................*...............................|..................................................*........................... + // srshr v25.8h, v25.8h, #11 // ...........................*..........................|.......................................................*...................... + // mls v10.8h, v25.8h, v7.h[0] // ..............................*.......................|..........................................................*................... 
+ // sqdmulh v25.8h, v9.8h, v7.h[1] // ........................*.............................|....................................................*......................... + // srshr v25.8h, v25.8h, #11 // ............................*.........................|........................................................*..................... + // mls v9.8h, v25.8h, v7.h[0] // ...............................*......................|...........................................................*.................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // .........................*............................|.....................................................*........................ + // srshr v25.8h, v25.8h, #11 // .............................*........................|.........................................................*.................... + // mls v11.8h, v25.8h, v7.h[0] // ................................*.....................|............................................................*................. + // sub v24.8h, v8.8h, v10.8h // ..................................*...................|..............................................................*............... + // add v8.8h, v8.8h, v10.8h // ...................................*..................|...............................................................*.............. + // mul v10.8h, v24.8h, v0.h[0] // .....................................*................|.................................................................*............ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................*...............|..................................................................*........... + // mls v10.8h, v24.8h, v7.h[0] // ..........................................*...........|......................................................................*....... 
+ // sub v24.8h, v9.8h, v11.8h // ....................................*.................|................................................................*.............
+ // add v9.8h, v9.8h, v11.8h // .........................................*............|.....................................................................*........
+ // mul v11.8h, v24.8h, v0.h[0] // .......................................*..............|...................................................................*..........
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................*.............|....................................................................*.........
+ // mls v11.8h, v24.8h, v7.h[0] // ............................................*.........|........................................................................*.....
+ // str q8, [x1], #(64) // ...........................................*..........|.......................................................................*......
+ // str q9, [x1, #(-64 + 16*1)] // .............................................*........|.........................................................................*....
+ // str q10, [x1, #(-64 + 16*2)] // ...............................................*......|...........................................................................*..
+ // str q11, [x1, #(-64 + 16*3)] // .................................................*....|.............................................................................*
+ // Loop tail: decrement the iteration counter and branch back to layer4567_start while it is non-zero. The straight-line code after the branch runs once, when the loop exits. NOTE(review): presumably the software-pipelining postamble that finishes work the final iteration left in flight - confirm against the loop body.
+ sub count, count, #1
+ cbnz count, layer4567_start
+ trn1 v22.4S, v23.4S, v27.4S // *.......................................................................
+ // gap // ........................................................................
+ trn2 v0.4S, v23.4S, v27.4S // .*......................................................................
+ // gap // ........................................................................
+ trn2 v23.4S, v3.4S, v28.4S // ..*.....................................................................
+ // gap // ........................................................................
+ trn2 v28.2D, v22.2D, v26.2D // ...*....................................................................
+ // gap // ........................................................................
+ trn1 v26.2D, v22.2D, v26.2D // ....*...................................................................
+ // gap // ........................................................................
+ trn1 v22.2D, v0.2D, v23.2D // ......*.................................................................
+ // gap // ........................................................................
+ trn2 v19.2D, v0.2D, v23.2D // .....*..................................................................
+ // gap // ........................................................................
+ sub v23.8H, v26.8H, v22.8H // .........*..............................................................
+ // gap // ........................................................................
+ add v3.8H, v28.8H, v19.8H // ........*...............................................................
+ // gap // ........................................................................
+ sub v0.8H, v28.8H, v19.8H // .......*................................................................
+ // gap // ........................................................................
+ sqrdmulh v19.8H, v23.8H, v11.8H // .............*..........................................................
+ // gap // ........................................................................
+ mul v28.8H, v23.8H, v24.8H // ............*...........................................................
+ // gap // ........................................................................
+ sqrdmulh v23.8H, v0.8H, v25.8H // ..............*.........................................................
+ // gap // ........................................................................
+ mul v27.8H, v0.8H, v14.8H // ...........*............................................................
+ // gap // ........................................................................
+ add v24.8H, v26.8H, v22.8H // ..........*.............................................................
+ // gap // ........................................................................
+ mls v28.8H, v19.8H, v7.H[0] // .................*......................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ mls v27.8H, v23.8H, v7.H[0] // ..................*.....................................................
+ // gap // ........................................................................
+ sub v19.8H, v24.8H, v3.8H // ...............*........................................................
+ // gap // ........................................................................
+ ldr q11, [x3], #16 // .....................*..................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ sub v23.8H, v28.8H, v27.8H // ......................*.................................................
+ // gap // ........................................................................
+ sqrdmulh v22.8H, v19.8H, v20.8H // ....................*...................................................
+ // gap // ........................................................................
+ mul v0.8H, v19.8H, v29.8H // ...................*....................................................
+ // gap // ........................................................................
+ sqrdmulh v19.8H, v23.8H, v20.8H // ..........................*.............................................
+ // gap // ........................................................................
+ mul v23.8H, v23.8H, v29.8H // .........................*..............................................
+ // gap // ........................................................................
+ add v28.8H, v28.8H, v27.8H // ........................*...............................................
+ // gap // ........................................................................
+ add v27.8H, v24.8H, v3.8H // ................*.......................................................
+ // gap // ........................................................................
+ mls v0.8H, v22.8H, v7.H[0] // .......................*................................................
+ // gap // ........................................................................
+ mls v23.8H, v19.8H, v7.H[0] // .............................*..........................................
+ // gap // ........................................................................
+ trn2 v22.4S, v27.4S, v28.4S // ............................*...........................................
+ // gap // ........................................................................
+ trn1 v27.4S, v27.4S, v28.4S // ...........................*............................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ trn2 v19.4S, v0.4S, v23.4S // ...............................*........................................
+ // gap // ........................................................................
+ trn1 v0.4S, v0.4S, v23.4S // ..............................*.........................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ trn2 v23.2D, v22.2D, v19.2D // .................................*......................................
+ // gap // ........................................................................
+ trn2 v28.2D, v27.2D, v0.2D // ................................*.......................................
+ // gap // ........................................................................
+ trn1 v22.2D, v22.2D, v19.2D // ...................................*....................................
+ // gap // ........................................................................
+ sub v19.8H, v28.8H, v23.8H // ....................................*...................................
+ // gap // ........................................................................
+ trn1 v26.2D, v27.2D, v0.2D // ..................................*.....................................
+ // gap // ........................................................................
+ add v0.8H, v28.8H, v23.8H // ...........................................*............................
+ // gap // ........................................................................
+ sub v27.8H, v26.8H, v22.8H // .....................................*..................................
+ // gap // ........................................................................
+ sqrdmulh v23.8H, v19.8H, v11.H[5] // ..........................................*.............................
+ // gap // ........................................................................
+ mul v24.8H, v19.8H, v11.H[4] // .......................................*................................
+ // gap // ........................................................................
+ sqrdmulh v28.8H, v27.8H, v11.H[3] // .........................................*..............................
+ // gap // ........................................................................
+ mul v3.8H, v27.8H, v11.H[2] // ........................................*...............................
+ // gap // ........................................................................
+ sqdmulh v19.8H, v0.8H, v7.H[1] // ...............................................*........................
+ // gap // ........................................................................
+ mls v24.8H, v23.8H, v7.H[0] // ..............................................*.........................
+ // gap // ........................................................................
+ add v27.8H, v26.8H, v22.8H // ......................................*.................................
+ // gap // ........................................................................
+ mls v3.8H, v28.8H, v7.H[0] // .............................................*..........................
+ // gap // ........................................................................
+ srshr v19.8H, v19.8H, #11 // ....................................................*...................
+ // gap // ........................................................................
+ sqdmulh v22.8H, v27.8H, v7.H[1] // ............................................*...........................
+ // gap // ........................................................................
+ sqdmulh v26.8H, v24.8H, v7.H[1] // ..................................................*.....................
+ // gap // ........................................................................
+ sqdmulh v23.8H, v3.8H, v7.H[1] // .................................................*......................
+ // gap // ........................................................................
+ mls v0.8H, v19.8H, v7.H[0] // .......................................................*................
+ // gap // ........................................................................
+ srshr v22.8H, v22.8H, #11 // ................................................*.......................
+ // gap // ........................................................................
+ srshr v19.8H, v26.8H, #11 // ......................................................*.................
+ // gap // ........................................................................
+ srshr v23.8H, v23.8H, #11 // .....................................................*..................
+ // gap // ........................................................................
+ mls v27.8H, v22.8H, v7.H[0] // ...................................................*....................
+ // gap // ........................................................................
+ mls v24.8H, v19.8H, v7.H[0] // .........................................................*..............
+ // gap // ........................................................................
+ mls v3.8H, v23.8H, v7.H[0] // ........................................................*...............
+ // gap // ........................................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ sub v19.8H, v27.8H, v0.8H // ..........................................................*.............
+ // gap // ........................................................................
+ add v27.8H, v27.8H, v0.8H // ...........................................................*............
+ // gap // ........................................................................
+ sub v22.8H, v3.8H, v24.8H // ............................................................*...........
+ // gap // ........................................................................
+ mul v0.8H, v19.8H, v11.H[0] // .............................................................*..........
+ // gap // ........................................................................
+ sqrdmulh v28.8H, v19.8H, v11.H[1] // ..............................................................*.........
+ // gap // ........................................................................
+ sqrdmulh v23.8H, v22.8H, v11.H[1] // ................................................................*.......
+ // gap // ........................................................................
+ mul v22.8H, v22.8H, v11.H[0] // ...............................................................*........
+ // gap // ........................................................................
+ add v19.8H, v3.8H, v24.8H // .................................................................*......
+ // gap // ........................................................................
+ mls v0.8H, v28.8H, v7.H[0] // ..................................................................*.....
+ // gap // ........................................................................
+ str q27, [x1], #(64) // ...................................................................*....
+ // gap // ........................................................................
+ mls v22.8H, v23.8H, v7.H[0] // ....................................................................*...
+ // gap // ........................................................................
+ str q19, [x1, #-48] // .....................................................................*..
+ // gap // ........................................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ str q0, [x1, #-32] // ......................................................................*.
+ // gap // ........................................................................
+ // gap // ........................................................................
+ // gap // ........................................................................
+ str q22, [x1, #-16] // .......................................................................*
+ // gap // ........................................................................
+ // Reference listing of the postamble before scheduling. NOTE(review): the '*' column on each instruction above appears to be that instruction's index in this listing, and the '*' column here its index in the scheduled code; register names differ between the two listings.
+ // original source code
+ // trn1 v19.4S, v23.4S, v27.4S // *.......................................................................
+ // trn2 v23.4S, v23.4S, v27.4S // .*......................................................................
+ // trn2 v22.4S, v3.4S, v28.4S // ..*.....................................................................
+ // trn2 v28.2D, v19.2D, v26.2D // ...*....................................................................
+ // trn1 v19.2D, v19.2D, v26.2D // ....*...................................................................
+ // trn2 v0.2D, v23.2D, v22.2D // ......*.................................................................
+ // trn1 v23.2D, v23.2D, v22.2D // .....*..................................................................
+ // sub v22.8H, v28.8H, v0.8H // .........*..............................................................
+ // add v28.8H, v28.8H, v0.8H // ........*...............................................................
+ // sub v0.8H, v19.8H, v23.8H // .......*................................................................
+ // add v19.8H, v19.8H, v23.8H // ..............*.........................................................
+ // mul v23.8H, v22.8H, v14.8H // .............*..........................................................
+ // mul v27.8H, v0.8H, v24.8H // ...........*............................................................
+ // sqrdmulh v0.8H, v0.8H, v11.8H // ..........*.............................................................
+ // sqrdmulh v22.8H, v22.8H, v25.8H // ............*...........................................................
+ // sub v24.8H, v19.8H, v28.8H // .................*......................................................
+ // add v19.8H, v19.8H, v28.8H // .........................*..............................................
+ // mls v27.8H, v0.8H, v7.H[0] // ...............*........................................................
+ // mls v23.8H, v22.8H, v7.H[0] // ................*.......................................................
+ // mul v22.8H, v24.8H, v29.8H // .....................*..................................................
+ // sqrdmulh v28.8H, v24.8H, v20.8H // ....................*...................................................
+ // ldr q0, [x3], #16 // ..................*.....................................................
+ // sub v24.8H, v27.8H, v23.8H // ...................*....................................................
+ // mls v22.8H, v28.8H, v7.H[0] // ..........................*.............................................
+ // add v23.8H, v27.8H, v23.8H // ........................*...............................................
+ // mul v28.8H, v24.8H, v29.8H // .......................*................................................
+ // sqrdmulh v27.8H, v24.8H, v20.8H // ......................*.................................................
+ // trn1 v24.4S, v19.4S, v23.4S // .............................*..........................................
+ // trn2 v19.4S, v19.4S, v23.4S // ............................*...........................................
+ // mls v28.8H, v27.8H, v7.H[0] // ...........................*............................................
+ // trn1 v26.4S, v22.4S, v28.4S // ...............................*........................................
+ // trn2 v22.4S, v22.4S, v28.4S // ..............................*.........................................
+ // trn2 v29.2D, v24.2D, v26.2D // .................................*......................................
+ // trn2 v20.2D, v19.2D, v22.2D // ................................*.......................................
+ // trn1 v24.2D, v24.2D, v26.2D // ....................................*...................................
+ // trn1 v19.2D, v19.2D, v22.2D // ..................................*.....................................
+ // sub v22.8H, v29.8H, v20.8H // ...................................*....................................
+ // sub v26.8H, v24.8H, v19.8H // ......................................*.................................
+ // add v19.8H, v24.8H, v19.8H // .............................................*..........................
+ // mul v24.8H, v22.8H, v0.H[4] // ........................................*...............................
+ // mul v11.8H, v26.8H, v0.H[2] // ..........................................*.............................
+ // sqrdmulh v26.8H, v26.8H, v0.H[3] // .........................................*..............................
+ // sqrdmulh v22.8H, v22.8H, v0.H[5] // .......................................*................................
+ // add v29.8H, v29.8H, v20.8H // .....................................*..................................
+ // sqdmulh v20.8H, v19.8H, v7.H[1] // ................................................*.......................
+ // mls v11.8H, v26.8H, v7.H[0] // ..............................................*.........................
+ // mls v24.8H, v22.8H, v7.H[0] // ............................................*...........................
+ // sqdmulh v22.8H, v29.8H, v7.H[1] // ...........................................*............................
+ // srshr v26.8H, v20.8H, #11 // ....................................................*...................
+ // sqdmulh v20.8H, v11.8H, v7.H[1] // ..................................................*.....................
+ // sqdmulh v14.8H, v24.8H, v7.H[1] // .................................................*......................
+ // mls v19.8H, v26.8H, v7.H[0] // .......................................................*................
+ // srshr v22.8H, v22.8H, #11 // ...............................................*........................
+ // srshr v26.8H, v20.8H, #11 // ......................................................*.................
+ // srshr v20.8H, v14.8H, #11 // .....................................................*..................
+ // mls v29.8H, v22.8H, v7.H[0] // ...................................................*....................
+ // mls v11.8H, v26.8H, v7.H[0] // .........................................................*..............
+ // mls v24.8H, v20.8H, v7.H[0] // ........................................................*...............
+ // sub v22.8H, v19.8H, v29.8H // ..........................................................*.............
+ // add v19.8H, v19.8H, v29.8H // ...........................................................*............
+ // sub v29.8H, v11.8H, v24.8H // ............................................................*...........
+ // mul v20.8H, v22.8H, v0.H[0] // .............................................................*..........
+ // sqrdmulh v22.8H, v22.8H, v0.H[1] // ..............................................................*.........
+ // mul v14.8H, v29.8H, v0.H[0] // ................................................................*.......
+ // sqrdmulh v0.8H, v29.8H, v0.H[1] // ...............................................................*........
+ // add v24.8H, v11.8H, v24.8H // .................................................................*......
+ // mls v20.8H, v22.8H, v7.H[0] // ..................................................................*.....
+ // str q19, [x1], #(64) // ...................................................................*....
+ // mls v14.8H, v0.8H, v7.H[0] // ....................................................................*...
+ // str q24, [x1, #-48] // .....................................................................*..
+ // str q20, [x1, #-32] // ......................................................................*.
+ // str q14, [x1, #-16] // .......................................................................*
+
+
+ // ---------------------------------------------------------------------
+ // Setup for the layer123 loop: name two constant registers, broadcast their values, set the loop counter and load the roots.
+ ninv .req v29 // register alias: ninv is v29
+ ninv_tw .req v30 // register alias: ninv_tw is v30
+ // Each ld1r reads one halfword from [xtmp] and replicates it into all 8 lanes. ASM_LOAD is a macro defined elsewhere - presumably it places the named symbol's address in xtmp (confirm).
+ ASM_LOAD(xtmp, ninv_addr)
+ ld1r {ninv.8h}, [xtmp]
+ ASM_LOAD(xtmp, ninv_tw_addr)
+ ld1r {ninv_tw.8h}, [xtmp]
+ // The counter starts at 4 and is decremented once more before the layer123_start label. load_roots_123 and r_ptr0 are defined outside this section - presumably they load the layer 1-3 twiddle factors via r_ptr0 (confirm).
+ mov count, #4
+ ASM_LOAD(r_ptr0, roots_l012)
+ load_roots_123
+
+ .p2align 2 // align the next instruction to a 2^2 = 4-byte boundary
+ // Preamble of the layer123 loop: pre-load seven vectors from x0 at a 64-byte stride (offsets 64..448). The vector at offset 0 is loaded by the first instruction of the loop body.
+ ldr q24, [x0, #64] // *......
+ // gap // .......
+ // gap // .......
+ // gap // .......
+ ldr q22, [x0, #128] // .*.....
+ // gap // .......
+ // gap // .......
+ // gap // .......
+ ldr q28, [x0, #192] // ..*....
+ // gap // .......
+ // gap // .......
+ // gap // .......
+ ldr q3, [x0, #256] // ...*...
+ // gap // .......
+ // gap // .......
+ // gap // .......
+ ldr q26, [x0, #320] // ....*..
+ // gap // .......
+ // gap // .......
+ // gap // .......
+ ldr q11, [x0, #384] // .....*.
+ // gap // .......
+ // gap // .......
+ // gap // .......
+ ldr q14, [x0, #448] // ......*
+ // gap // .......
+
+ // original source code
+ // ldr q24, [x0, #64] // *......
+ // ldr q22, [x0, #128] // .*.....
+ // ldr q28, [x0, #192] // ..*....
+ // ldr q3, [x0, #256] // ...*...
+ // ldr q26, [x0, #320] // ....*..
+ // ldr q11, [x0, #384] // .....*. + // ldr q14, [x0, #448] // ......* + + sub count, count, #1 +layer123_start: + ldr q23, [x0, #0] // *....................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v19.8H, v22.8H, v28.8H // .............*.......................................................................... + // gap // ........................................................................................ + add v22.8H, v22.8H, v28.8H // ..............*......................................................................... + // gap // ........................................................................................ + sub v28.8H, v23.8H, v24.8H // ........*............................................................................... + // gap // ........................................................................................ + add v23.8H, v23.8H, v24.8H // .........*.............................................................................. + // gap // ........................................................................................ + mul v27.8H, v19.8H, v1.H[0] // ...............*........................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v19.8H, v1.H[1] // ................*....................................................................... + // gap // ........................................................................................ + sub v24.8H, v23.8H, v22.8H // ............................*........................................................... 
+ // gap // ........................................................................................ + add v21.8H, v23.8H, v22.8H // .............................*.......................................................... + // gap // ........................................................................................ + mul v22.8H, v28.8H, v0.H[6] // ..........*............................................................................. + // gap // ........................................................................................ + sqrdmulh v28.8H, v28.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + mls v27.8H, v8.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ + sub v19.8H, v3.8H, v26.8H // ..................*..................................................................... + // gap // ........................................................................................ + add v3.8H, v3.8H, v26.8H // ...................*.................................................................... + // gap // ........................................................................................ + mls v22.8H, v28.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + mul v28.8H, v19.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ 
+ add v25.8H, v11.8H, v14.8H // ........................*............................................................... + // gap // ........................................................................................ + mul v26.8H, v24.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v24.8H, v24.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + add v20.8H, v3.8H, v25.8H // .......................................*................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v19.8H, v1.H[3] // .....................*.................................................................. + // gap // ........................................................................................ + sub v11.8H, v11.8H, v14.8H // .......................*................................................................ + // gap // ........................................................................................ + add v23.8H, v21.8H, v20.8H // .................................................*...................................... + // gap // ........................................................................................ + sub v14.8H, v21.8H, v20.8H // ................................................*....................................... + // gap // ........................................................................................ + mls v28.8H, v19.8H, v7.H[0] // ......................*................................................................. 
+ // gap // ........................................................................................ + sub v19.8H, v22.8H, v27.8H // .................................*...................................................... + // gap // ........................................................................................ + add v22.8H, v22.8H, v27.8H // ..................................*..................................................... + // gap // ........................................................................................ + mul v27.8H, v11.8H, v1.H[4] // .........................*.............................................................. + // gap // ........................................................................................ + sqrdmulh v20.8H, v11.8H, v1.H[5] // ..........................*............................................................. + // gap // ........................................................................................ + sub v3.8H, v3.8H, v25.8H // ......................................*................................................. + // gap // ........................................................................................ + mls v26.8H, v24.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + mul v24.8H, v19.8H, v0.H[2] // ...................................*.................................................... + // gap // ........................................................................................ + mls v27.8H, v20.8H, v7.H[0] // ...........................*............................................................ + // gap // ........................................................................................ 
+ sqrdmulh v19.8H, v19.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + mul v20.8H, v3.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ + sqrdmulh v3.8H, v3.8H, v0.H[5] // .........................................*.............................................. + // gap // ........................................................................................ + sub v11.8H, v28.8H, v27.8H // ...........................................*............................................ + // gap // ........................................................................................ + mls v24.8H, v19.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + add v19.8H, v28.8H, v27.8H // ............................................*........................................... + // gap // ........................................................................................ + mul v28.8H, v11.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + sqrdmulh v27.8H, v11.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + mls v20.8H, v3.8H, v7.H[0] // ..........................................*............................................. 
+ // gap // ........................................................................................ + mul v3.8H, v14.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ + sqrdmulh v25.8H, v14.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + add v8.8H, v22.8H, v19.8H // ......................................................*................................. + // gap // ........................................................................................ + sub v14.8H, v22.8H, v19.8H // .....................................................*.................................. + // gap // ........................................................................................ + mls v28.8H, v27.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + mul v22.8H, v23.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + sqrdmulh v11.8H, v14.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + sqrdmulh v23.8H, v23.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ 
+ mls v3.8H, v25.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + mul v27.8H, v14.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + sub v14.8H, v26.8H, v20.8H // ..........................................................*............................. + // gap // ........................................................................................ + add v26.8H, v26.8H, v20.8H // ...........................................................*............................ + // gap // ........................................................................................ + sub v20.8H, v24.8H, v28.8H // ...............................................................*........................ + // gap // ........................................................................................ + mls v27.8H, v11.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + mul v11.8H, v14.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + sqrdmulh v14.8H, v14.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + add v28.8H, v24.8H, v28.8H // ................................................................*....................... 
+ // gap // ........................................................................................ + str q27, [x0, #320] // .....................................................................*.................. + // gap // ........................................................................................ + mul v24.8H, v20.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + sqrdmulh v20.8H, v20.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + str q3, [x0, #256] // ....................................................................*................... + // gap // ........................................................................................ + mls v11.8H, v14.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + mls v22.8H, v23.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + mls v24.8H, v20.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + mul v23.8H, v8.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + str q11, [x0, #384] // ......................................................................*................. 
+ // gap // ........................................................................................ + sqrdmulh v19.8H, v8.8H, v30.8H // ............................................................................*........... + // gap // ........................................................................................ + str q24, [x0, #448] // .......................................................................*................ + // gap // ........................................................................................ + mul v27.8H, v26.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + str q22, [x0], #(16) // ....................................................................................*... <- post-indexed store: x0 advances by 16 here, so later [x0, #imm] accesses in this pass use the new x0 + // gap // ........................................................................................ + sqrdmulh v24.8H, v26.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + mls v23.8H, v19.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + mul v19.8H, v28.8H, v29.8H // .................................................................................*...... + // gap // ........................................................................................ + sqrdmulh v8.8H, v28.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + mls v27.8H, v24.8H, v7.H[0] // ................................................................................*....... 
+ // gap // ........................................................................................ + ldr q24, [x0, #64] // .e...................................................................................... <- 'e': load for the following pass, issued early (same offsets as the loads ahead of the loop) + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.8H, v8.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + str q23, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + ldr q22, [x0, #128] // ..e..................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q27, [x0, #112] // ......................................................................................*. + // gap // ........................................................................................ + ldr q28, [x0, #192] // ...e.................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + str q19, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + ldr q3, [x0, #256] // ....e................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q26, [x0, #320] // .....e.................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q11, [x0, #384] // ......e................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q14, [x0, #448] // .......e................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ + // original source code: the loop body in its unscheduled order. Right-hand columns: '*' shows where the instruction lands in the scheduled loop above, 'e' marks the loads issued early for the following pass, and '|' appears to mark the pass boundary. + // ldr q8, [x0, #0] // ...........*................................................................................... + // ldr q9, [x0, #(1*(512/8))] // e..........|............................................................................e...... + // ldr q10, [x0, #(2*(512/8))] // ...e.......|...............................................................................e... + // ldr q11, [x0, #(3*(512/8))] // .....e.....|.................................................................................e. + // ldr q12, [x0, #(4*(512/8))] // .......e...|................................................................................... + // ldr q13, [x0, #(5*(512/8))] // ........e..|................................................................................... + // ldr q14, [x0, #(6*(512/8))] // .........e.|................................................................................... + // ldr q15, [x0, #(7*(512/8))] // ..........e|................................................................................... + // sub v24.8h, v8.8h, v9.8h // ...........|..*................................................................................ + // add v8.8h, v8.8h, v9.8h // ...........|...*............................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...........|........*.......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ...........|.........*......................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ...........|.............*..................................................................... + // sub v24.8h, v10.8h, v11.8h // ...........|*.................................................................................. + // add v10.8h, v10.8h, v11.8h // ...........|.*................................................................................. 
+ // mul v11.8h, v24.8h, v1.h[0] // ...........|....*.............................................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...........|.....*............................................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........|..........*........................................................................ + // sub v24.8h, v12.8h, v13.8h // ...........|...........*....................................................................... + // add v12.8h, v12.8h, v13.8h // ...........|............*...................................................................... + // mul v13.8h, v24.8h, v1.h[2] // ...........|..............*.................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........|...................*............................................................... + // mls v13.8h, v24.8h, v7.h[0] // ...........|.......................*........................................................... + // sub v24.8h, v14.8h, v15.8h // ...........|....................*.............................................................. + // add v14.8h, v14.8h, v15.8h // ...........|...............*................................................................... + // mul v15.8h, v24.8h, v1.h[4] // ...........|..........................*........................................................ + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ...........|...........................*....................................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|...............................*................................................... + // sub v24.8h, v8.8h, v10.8h // ...........|......*............................................................................ + // add v8.8h, v8.8h, v10.8h // ...........|.......*........................................................................... 
+ // mul v10.8h, v24.8h, v0.h[2] // ...........|................*.................................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|.................*................................................................. + // mls v10.8h, v24.8h, v7.h[0] // ...........|.............................*..................................................... + // sub v24.8h, v9.8h, v11.8h // ...........|........................*.......................................................... + // add v9.8h, v9.8h, v11.8h // ...........|.........................*......................................................... + // mul v11.8h, v24.8h, v0.h[2] // ...........|..............................*.................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...........|................................*.................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........|....................................*.............................................. + // sub v24.8h, v12.8h, v14.8h // ...........|............................*...................................................... + // add v12.8h, v12.8h, v14.8h // ...........|..................*................................................................ + // mul v14.8h, v24.8h, v0.h[4] // ...........|.................................*................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|..................................*................................................ + // mls v14.8h, v24.8h, v7.h[0] // ...........|........................................*.......................................... + // sub v24.8h, v13.8h, v15.8h // ...........|...................................*............................................... + // add v13.8h, v13.8h, v15.8h // ...........|.....................................*............................................. 
+ // mul v15.8h, v24.8h, v0.h[4] // ...........|......................................*............................................ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...........|.......................................*........................................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|.............................................*..................................... + // sub v24.8h, v8.8h, v12.8h // ...........|......................*............................................................ + // add v8.8h, v8.8h, v12.8h // ...........|.....................*............................................................. + // mul v12.8h, v24.8h, v0.h[0] // ...........|.........................................*......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|..........................................*........................................ + // mls v12.8h, v24.8h, v7.h[0] // ...........|.................................................*................................. + // sub v24.8h, v9.8h, v13.8h // ...........|............................................*...................................... + // add v9.8h, v9.8h, v13.8h // ...........|...........................................*....................................... + // mul v13.8h, v24.8h, v0.h[0] // ...........|..................................................*................................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|...............................................*................................... + // mls v13.8h, v24.8h, v7.h[0] // ...........|......................................................*............................ + // sub v24.8h, v10.8h, v14.8h // ...........|...................................................*............................... + // add v10.8h, v10.8h, v14.8h // ...........|....................................................*.............................. 
+ // mul v14.8h, v24.8h, v0.h[0] // ...........|.......................................................*........................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|........................................................*.......................... + // mls v14.8h, v24.8h, v7.h[0] // ...........|..............................................................*.................... + // sub v24.8h, v11.8h, v15.8h // ...........|.....................................................*............................. + // add v11.8h, v11.8h, v15.8h // ...........|.........................................................*......................... + // mul v15.8h, v24.8h, v0.h[0] // ...........|...........................................................*....................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...........|............................................................*...................... + // mls v15.8h, v24.8h, v7.h[0] // ...........|................................................................*.................. + // str q12, [x0, #(4*(512/8))] // ...........|.............................................................*..................... + // str q13, [x0, #(5*(512/8))] // ...........|..........................................................*........................ + // str q14, [x0, #(6*(512/8))] // ...........|..................................................................*................ + // str q15, [x0, #(7*(512/8))] // ...........|....................................................................*.............. + // mul v12.8h, v8.8h, v29.8h // ...........|..............................................*.................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ...........|................................................*.................................. + // mls v12.8h, v8.8h, v7.h[0] // ...........|...............................................................*................... 
+ // mul v13.8h, v9.8h, v29.8h // ...........|.................................................................*................. + // sqrdmulh v9.8h, v9.8h, v30.8h // ...........|...................................................................*............... + // mls v13.8h, v9.8h, v7.h[0] // ...........|........................................................................*.......... + // mul v14.8h, v10.8h, v29.8h // ...........|.....................................................................*............. + // sqrdmulh v10.8h, v10.8h, v30.8h // ...........|.......................................................................*........... + // mls v14.8h, v10.8h, v7.h[0] // ...........|...........................................................................*....... + // mul v15.8h, v11.8h, v29.8h // ...........|.........................................................................*......... + // sqrdmulh v11.8h, v11.8h, v30.8h // ...........|..........................................................................*........ + // mls v15.8h, v11.8h, v7.h[0] // .*.........|.............................................................................*..... + // str q12, [x0], #(16) // ...........|......................................................................*............ + // str q13, [x0, #(-16 + 1*(512/8))] // ..*........|..............................................................................*.... + // str q14, [x0, #(-16 + 2*(512/8))] // ....*......|................................................................................*.. + // str q15, [x0, #(-16 + 3*(512/8))] // ......*....|..................................................................................* + // Loop control: count one pass off and branch back to layer123_start while count is non-zero. + sub count, count, #1 + cbnz count, layer123_start + ldr q4, [x0, #0] // *................................................................................ + // gap // ................................................................................. 
+ // gap // ................................................................................. + // gap // ................................................................................. + sub v9.8H, v22.8H, v28.8H // .*............................................................................... + // gap // ................................................................................. + sub v5.8H, v3.8H, v26.8H // ............*.................................................................... + // gap // ................................................................................. + sub v6.8H, v4.8H, v24.8H // ...*............................................................................. + // gap // ................................................................................. + sqrdmulh v15.8H, v9.8H, v1.H[1] // ......*.......................................................................... + // gap // ................................................................................. + sub v20.8H, v11.8H, v14.8H // .....................*........................................................... + // gap // ................................................................................. + mul v19.8H, v6.8H, v0.H[6] // .........*....................................................................... + // gap // ................................................................................. + mul v23.8H, v5.8H, v1.H[2] // ...............*................................................................. + // gap // ................................................................................. + mul v27.8H, v20.8H, v1.H[4] // ...........................*..................................................... + // gap // ................................................................................. + sqrdmulh v21.8H, v20.8H, v1.H[5] // ............................*.................................................... 
+ // gap // ................................................................................. + sqrdmulh v10.8H, v5.8H, v1.H[3] // ....................*............................................................ + // gap // ................................................................................. + mul v8.8H, v9.8H, v1.H[0] // .....*........................................................................... + // gap // ................................................................................. + sqrdmulh v9.8H, v6.8H, v0.H[7] // ..........*...................................................................... + // gap // ................................................................................. + mls v27.8H, v21.8H, v7.H[0] // ................................*................................................ + // gap // ................................................................................. + mls v23.8H, v10.8H, v7.H[0] // ........................*........................................................ + // gap // ................................................................................. + mls v8.8H, v15.8H, v7.H[0] // ...........*..................................................................... + // gap // ................................................................................. + mls v19.8H, v9.8H, v7.H[0] // ..............*.................................................................. + // gap // ................................................................................. + add v16.8H, v11.8H, v14.8H // ................*................................................................ + // gap // ................................................................................. + sub v20.8H, v23.8H, v27.8H // ....................................*............................................ + // gap // ................................................................................. 
+ add v14.8H, v23.8H, v27.8H // ......................................*.......................................... + // gap // ................................................................................. + add v5.8H, v19.8H, v8.8H // ..........................*...................................................... + // gap // ................................................................................. + sub v15.8H, v19.8H, v8.8H // .........................*....................................................... + // gap // ................................................................................. + add v10.8H, v3.8H, v26.8H // .............*................................................................... + // gap // ................................................................................. + sub v27.8H, v5.8H, v14.8H // .............................................*................................... + // gap // ................................................................................. + add v9.8H, v22.8H, v28.8H // ..*.............................................................................. + // gap // ................................................................................. + add v4.8H, v4.8H, v24.8H // ....*............................................................................ + // gap // ................................................................................. + sqrdmulh v19.8H, v27.8H, v0.H[1] // ................................................*................................ + // gap // ................................................................................. + mul v23.8H, v27.8H, v0.H[0] // ...................................................*............................. + // gap // ................................................................................. + sqrdmulh v18.8H, v15.8H, v0.H[3] // .................................*............................................... 
+ // gap // ................................................................................. + add v3.8H, v10.8H, v16.8H // ...................*............................................................. + // gap // ................................................................................. + add v24.8H, v4.8H, v9.8H // ........*........................................................................ + // gap // ................................................................................. + mls v23.8H, v19.8H, v7.H[0] // .......................................................*......................... + // gap // ................................................................................. + sqrdmulh v26.8H, v20.8H, v0.H[5] // ........................................*........................................ + // gap // ................................................................................. + sub v22.8H, v24.8H, v3.8H // .......................*......................................................... + // gap // ................................................................................. + mul v11.8H, v20.8H, v0.H[4] // .......................................*......................................... + // gap // ................................................................................. + str q23, [x0, #320] // ...........................................................*..................... + // gap // ................................................................................. + sqrdmulh v19.8H, v22.8H, v0.H[1] // ...........................................*..................................... + // gap // ................................................................................. + mul v25.8H, v22.8H, v0.H[0] // ..........................................*...................................... + // gap // ................................................................................. 
+ mls v11.8H, v26.8H, v7.H[0] // ..............................................*.................................. + // gap // ................................................................................. + mul v20.8H, v15.8H, v0.H[2] // ...............................*................................................. + // gap // ................................................................................. + sub v12.8H, v10.8H, v16.8H // .............................*................................................... + // gap // ................................................................................. + mls v25.8H, v19.8H, v7.H[0] // ..................................................*.............................. + // gap // ................................................................................. + add v23.8H, v5.8H, v14.8H // ............................................*.................................... + // gap // ................................................................................. + mls v20.8H, v18.8H, v7.H[0] // .....................................*........................................... + // gap // ................................................................................. + mul v5.8H, v12.8H, v0.H[4] // ..................................*.............................................. + // gap // ................................................................................. + str q25, [x0, #256] // ..............................................................*.................. + // gap // ................................................................................. + mul v28.8H, v23.8H, v29.8H // ..................................................................*.............. + // gap // ................................................................................. + add v27.8H, v20.8H, v11.8H // ..........................................................*...................... 
+ // gap // ................................................................................. + sqrdmulh v19.8H, v23.8H, v30.8H // ....................................................................*............ + // gap // ................................................................................. + sqrdmulh v22.8H, v12.8H, v0.H[5] // ...................................*............................................. + // gap // ................................................................................. + mul v23.8H, v27.8H, v29.8H // ..........................................................................*...... + // gap // ................................................................................. + sqrdmulh v14.8H, v27.8H, v30.8H // ...........................................................................*..... + // gap // ................................................................................. + mls v28.8H, v19.8H, v7.H[0] // .........................................................................*....... + // gap // ................................................................................. + mls v5.8H, v22.8H, v7.H[0] // .........................................*....................................... + // gap // ................................................................................. + sub v21.8H, v4.8H, v9.8H // .......*......................................................................... + // gap // ................................................................................. + mls v23.8H, v14.8H, v7.H[0] // .............................................................................*... + // gap // ................................................................................. + str q28, [x0, #64] // ..............................................................................*.. + // gap // ................................................................................. 
+ mul v8.8H, v21.8H, v0.H[2] // .................*............................................................... + // gap // ................................................................................. + sqrdmulh v19.8H, v21.8H, v0.H[3] // ..................*.............................................................. + // gap // ................................................................................. + str q23, [x0, #192] // ................................................................................* + // gap // ................................................................................. + sub v27.8H, v20.8H, v11.8H // ......................................................*.......................... + // gap // ................................................................................. + add v25.8H, v24.8H, v3.8H // ......................*.......................................................... + // gap // ................................................................................. + mls v8.8H, v19.8H, v7.H[0] // ..............................*.................................................. + // gap // ................................................................................. + mul v3.8H, v27.8H, v0.H[0] // ............................................................*.................... + // gap // ................................................................................. + sqrdmulh v19.8H, v27.8H, v0.H[1] // .............................................................*................... + // gap // ................................................................................. + sqrdmulh v27.8H, v25.8H, v30.8H // .................................................*............................... + // gap // ................................................................................. + add v23.8H, v8.8H, v5.8H // .....................................................*........................... 
+ // gap // ................................................................................. + sub v26.8H, v8.8H, v5.8H // ....................................................*............................ + // gap // ................................................................................. + mls v3.8H, v19.8H, v7.H[0] // .................................................................*............... + // gap // ................................................................................. + sqrdmulh v19.8H, v23.8H, v30.8H // ........................................................................*........ + // gap // ................................................................................. + mul v22.8H, v23.8H, v29.8H // ......................................................................*.......... + // gap // ................................................................................. + sqrdmulh v28.8H, v26.8H, v0.H[1] // .........................................................*....................... + // gap // ................................................................................. + mul v24.8H, v26.8H, v0.H[0] // ........................................................*........................ + // gap // ................................................................................. + mul v23.8H, v25.8H, v29.8H // ...............................................*................................. + // gap // ................................................................................. + mls v22.8H, v19.8H, v7.H[0] // ............................................................................*.... + // gap // ................................................................................. + str q3, [x0, #448] // .....................................................................*........... + // gap // ................................................................................. 
+ mls v24.8H, v28.8H, v7.H[0] // ...............................................................*................. + // gap // ................................................................................. + mls v23.8H, v27.8H, v7.H[0] // ................................................................*................ + // gap // ................................................................................. + str q22, [x0, #128] // ...............................................................................*. + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. + str q24, [x0, #384] // ...................................................................*............. + // gap // ................................................................................. + // gap // ................................................................................. + // gap // ................................................................................. + str q23, [x0], #(16) // .......................................................................*......... + // gap // ................................................................................. + + // original source code + // ldr q23, [x0, #0] // *................................................................................ + // sub v19.8H, v22.8H, v28.8H // .*............................................................................... + // add v22.8H, v22.8H, v28.8H // ........................*........................................................ + // sub v28.8H, v23.8H, v24.8H // ...*............................................................................. 
+ // add v23.8H, v23.8H, v24.8H // .........................*....................................................... + // mul v27.8H, v19.8H, v1.H[0] // ...........*..................................................................... + // sqrdmulh v8.8H, v19.8H, v1.H[1] // ....*............................................................................ + // sub v24.8H, v23.8H, v22.8H // ......................................................*.......................... + // add v21.8H, v23.8H, v22.8H // ..............................*.................................................. + // mul v22.8H, v28.8H, v0.H[6] // ......*.......................................................................... + // sqrdmulh v28.8H, v28.8H, v0.H[7] // ............*.................................................................... + // mls v27.8H, v8.8H, v7.H[0] // ...............*................................................................. + // sub v19.8H, v3.8H, v26.8H // ..*.............................................................................. + // add v3.8H, v3.8H, v26.8H // ......................*.......................................................... + // mls v22.8H, v28.8H, v7.H[0] // ................*................................................................ + // mul v28.8H, v19.8H, v1.H[2] // .......*......................................................................... + // add v25.8H, v11.8H, v14.8H // .................*............................................................... + // mul v26.8H, v24.8H, v0.H[2] // .........................................................*....................... + // sqrdmulh v24.8H, v24.8H, v0.H[3] // ..........................................................*...................... + // add v20.8H, v3.8H, v25.8H // .............................*................................................... 
+ // sqrdmulh v19.8H, v19.8H, v1.H[3] // ..........*...................................................................... + // sub v11.8H, v11.8H, v14.8H // .....*........................................................................... + // add v23.8H, v21.8H, v20.8H // .............................................................*................... + // sub v14.8H, v21.8H, v20.8H // .................................*............................................... + // mls v28.8H, v19.8H, v7.H[0] // ..............*.................................................................. + // sub v19.8H, v22.8H, v27.8H // .....................*........................................................... + // add v22.8H, v22.8H, v27.8H // ....................*............................................................ + // mul v27.8H, v11.8H, v1.H[4] // ........*........................................................................ + // sqrdmulh v20.8H, v11.8H, v1.H[5] // .........*....................................................................... + // sub v3.8H, v3.8H, v25.8H // ........................................*........................................ + // mls v26.8H, v24.8H, v7.H[0] // ..............................................................*.................. + // mul v24.8H, v19.8H, v0.H[2] // .......................................*......................................... + // mls v27.8H, v20.8H, v7.H[0] // .............*................................................................... + // sqrdmulh v19.8H, v19.8H, v0.H[3] // ............................*.................................................... + // mul v20.8H, v3.8H, v0.H[4] // ............................................*.................................... + // sqrdmulh v3.8H, v3.8H, v0.H[5] // .................................................*............................... 
+ // sub v11.8H, v28.8H, v27.8H // ..................*.............................................................. + // mls v24.8H, v19.8H, v7.H[0] // ...........................................*..................................... + // add v19.8H, v28.8H, v27.8H // ...................*............................................................. + // mul v28.8H, v11.8H, v0.H[4] // ..................................*.............................................. + // sqrdmulh v27.8H, v11.8H, v0.H[5] // ................................*................................................ + // mls v20.8H, v3.8H, v7.H[0] // .....................................................*........................... + // mul v3.8H, v14.8H, v0.H[0] // .....................................*........................................... + // sqrdmulh v25.8H, v14.8H, v0.H[1] // ....................................*............................................ + // add v8.8H, v22.8H, v19.8H // ..........................................*...................................... + // sub v14.8H, v22.8H, v19.8H // .......................*......................................................... + // mls v28.8H, v27.8H, v7.H[0] // ......................................*.......................................... + // mul v22.8H, v23.8H, v29.8H // .........................................................................*....... + // sqrdmulh v11.8H, v14.8H, v0.H[1] // ..........................*...................................................... + // sqrdmulh v23.8H, v23.8H, v30.8H // .................................................................*............... + // mls v3.8H, v25.8H, v7.H[0] // .........................................*....................................... + // mul v27.8H, v14.8H, v0.H[0] // ...........................*..................................................... 
+ // sub v14.8H, v26.8H, v20.8H // ...................................................................*............. + // add v26.8H, v26.8H, v20.8H // ..................................................................*.............. + // sub v20.8H, v24.8H, v28.8H // ............................................................*.................... + // mls v27.8H, v11.8H, v7.H[0] // ...............................*................................................. + // mul v11.8H, v14.8H, v0.H[0] // ........................................................................*........ + // sqrdmulh v14.8H, v14.8H, v0.H[1] // .......................................................................*......... + // add v28.8H, v24.8H, v28.8H // ...............................................*................................. + // str q27, [x0, #320] // ...................................*............................................. + // mul v24.8H, v20.8H, v0.H[0] // ...............................................................*................. + // sqrdmulh v20.8H, v20.8H, v0.H[1] // ................................................................*................ + // str q3, [x0, #256] // .............................................*................................... + // mls v11.8H, v14.8H, v7.H[0] // ............................................................................*.... + // mls v22.8H, v23.8H, v7.H[0] // .............................................................................*... + // mls v24.8H, v20.8H, v7.H[0] // ....................................................................*............ + // mul v23.8H, v8.8H, v29.8H // ..............................................*.................................. + // str q11, [x0, #384] // ...............................................................................*. + // sqrdmulh v19.8H, v8.8H, v30.8H // ................................................*................................ 
+ // str q24, [x0, #448] // ...........................................................................*..... + // mul v27.8H, v26.8H, v29.8H // ......................................................................*.......... + // str q22, [x0], #(16) // ................................................................................* + // sqrdmulh v24.8H, v26.8H, v30.8H // .....................................................................*........... + // mls v23.8H, v19.8H, v7.H[0] // ....................................................*............................ + // mul v19.8H, v28.8H, v29.8H // ..................................................*.............................. + // sqrdmulh v8.8H, v28.8H, v30.8H // ...................................................*............................. + // mls v27.8H, v24.8H, v7.H[0] // ..........................................................................*...... + // mls v19.8H, v8.8H, v7.H[0] // .......................................................*......................... + // str q23, [x0, #48] // ........................................................*........................ + // str q27, [x0, #112] // ..............................................................................*.. + // str q19, [x0, #176] // ...........................................................*..................... 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s new file mode 100644 index 00000000..4e82bc77 --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_a72.s @@ -0,0 +1,1839 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores.
+ +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset // load qform_<vec> from [base, #offset]; base is left unchanged + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc // load qform_<vec> from [base], then advance base by inc (post-index) + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset // store qform_<vec> to [base, #offset]; base is left unchanged + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc // store qform_<vec> to [base], then advance base by inc (post-index) + str qform_\vec, [\base], #\inc +.endm + +.macro vqrdmulh d,a,b // d = sqrdmulh(a, b) on eight 16-bit lanes, vector by vector + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i // d = d - a * b.h[i] (mls by element) + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i // d = sqrdmulh(a, b.h[i]) (by element) + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i // d = sqdmulh(a, b.h[i]) (by element) + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i // d = a * b.h[i] (mul by element) + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm + +.macro mulmodq dst, src, const, idx0, idx1 // dst = src * const.h[idx0] - sqrdmulh(src, const.h[idx1]) * consts.h[0]; src is overwritten. NOTE(review): lane idx1 presumably holds the precomputed twisted companion of lane idx0 - confirm against the twiddle table + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm + +.macro mulmod dst, src, const, const_twisted // same three-step sequence as mulmodq, but with full-vector operands const / const_twisted; src is overwritten + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 // tmp = a - b; a = a + b; b = mulmodq(tmp, root lanes idx0/idx1); the tmp alias is used as scratch + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm + +.macro mulmod_v dst, src, const, const_twisted // body is identical to mulmod + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted // as gs_butterfly, with full-vector root / root_twisted. NOTE(review): expands mulmod rather than mulmod_v; equivalent only while the two bodies stay identical + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 // dstK = mulmod(srcK, ninv, ninv_tw) for K = 0..3; every srcK is overwritten because mulmod clobbers its src + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a // t0 = sqdmulh(a, consts.h[1]); t0 = srshr(t0, 11); a = a - t0 * consts.h[0]; the t0 alias is used as scratch + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm + +.macro load_roots_123 // root0 from [r_ptr0] with r_ptr0 advanced by 32, then root1 from the advanced pointer at offset -16 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, 
r_ptr0, -16 +.endm + +.macro load_next_roots_45 // root0 from [r_ptr0]; r_ptr0 advanced by 16 + ldr_vi root0, r_ptr0, 16 +.endm + +.macro load_next_roots_67 // loads root0, root0_tw, root1, root1_tw, root2, root2_tw from six consecutive 16-byte slots at r_ptr1; r_ptr1 advanced by 6*16 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data // in-place 4x4 transpose of the 32-bit elements held in <data>0..<data>3; t0-t3 are used as scratch + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in // only the 32-bit trn1/trn2 stage of transpose4: <data_in>0..3 interleaved pairwise into <data_out>0..3 + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) // reserve 6*16 bytes; x19-x29 are stored into them below + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store to the same slot; redundant but harmless + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] // reload x19-x29 from the slots written by save_gprs + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the 6*16 bytes reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // reserve 4*16 bytes; only d8-d15 (the low 64 bits of v8-v15) are stored + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] // reload d8-d15 from the slots written by save_vregs + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the 4*16 bytes reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] // load a from the stack slot at byte offset loc +.endm +.macro save loc, a // slothy:no-unfold + str \a, 
[sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE // extra STACK_SIZE bytes below the saved registers, addressed through the save/restore macros +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE // undo push_stack in reverse order + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_opt_a72 + .global _intt_kyber_123_4567_opt_a72 + +.p2align 4 +const_addr: .short 3329 // becomes consts.h[0] after the ld1 from const_addr; multiplier of every "vmlsq ..., consts, 0" (presumably the Kyber modulus q - confirm) + .short 20159 // becomes consts.h[1]; multiplier in barrett_reduce (presumably round(2^26 / 3329) - confirm) + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 // eight copies of 512. NOTE(review): presumably backs the ninv operand of mul_ninv - confirm + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 // eight copies of 5040. NOTE(review): presumably backs the ninv_tw operand of mul_ninv - confirm + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_opt_a72: +_intt_kyber_123_4567_opt_a72: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 .req x_31 + + qform_data0 .req 
q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + // gap // ........................................................ + ldr q13, [x1, #48] // .*...................................................... + ldr q14, [x1, #32] // *....................................................... + ldr q25, [x1, #0] // ...*.................................................... + ldr q1, [x1, #16] // ..*..................................................... + // gap // ........................................................ + ldr q4, [x4, #80] // .......................*................................ + ldr q6, [x4], #(6*16) // ....*................................................... + // gap // ........................................................ + ldr q11, [x4, #-32] // ...............*........................................ + ldr q10, [x4, #-64] // ............*........................................... + // gap // ........................................................ + trn2 v28.4S, v14.4S, v13.4S // ......*................................................. + trn1 v23.4S, v14.4S, v13.4S // ........*............................................... + ldr q9, [x4, #-48] // ..............*......................................... + trn2 v16.4S, v25.4S, v1.4S // .......*................................................ 
+ trn1 v5.4S, v25.4S, v1.4S // .........*.............................................. + ldr q3, [x4, #-80] // .....*.................................................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn1 v14.2D, v5.2D, v23.2D // .............*.......................................... + trn1 v12.2D, v16.2D, v28.2D // ..........*............................................. + // gap // ........................................................ + trn2 v19.2D, v16.2D, v28.2D // ...........*............................................ + // gap // ........................................................ + // gap // ........................................................ + trn2 v22.2D, v5.2D, v23.2D // ................*....................................... + // gap // ........................................................ + // gap // ........................................................ + sub v26.8H, v14.8H, v12.8H // .................*...................................... + add v13.8H, v14.8H, v12.8H // .....................*.................................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sub v27.8H, v22.8H, v19.8H // ..................*..................................... + // gap // ........................................................ + // gap // ........................................................ 
+ add v29.8H, v22.8H, v19.8H // ...................*.................................... + sqrdmulh v5.8H, v26.8H, v9.8H // ......................*................................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqrdmulh v30.8H, v27.8H, v4.8H // .........................*.............................. + // gap // ........................................................ + // gap // ........................................................ + sub v4.8H, v13.8H, v29.8H // ..........................*............................. + // gap // ........................................................ + // gap // ........................................................ + mul v24.8H, v26.8H, v10.8H // ....................*................................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mls v24.8H, v5.8H, v7.H[0] // ...........................*............................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mul v10.8H, v27.8H, v11.8H // ............................*........................... + add v27.8H, v13.8H, v29.8H // ........................*............................... 
+ // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mls v10.8H, v30.8H, v7.H[0] // .............................*.......................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqrdmulh v15.8H, v4.8H, v3.8H // ..............................*......................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mul v25.8H, v4.8H, v6.8H // ................................*....................... + // gap // ........................................................ + // gap // ........................................................ + sub v4.8H, v24.8H, v10.8H // ...............................*........................ + // gap // ........................................................ + // gap // ........................................................ + add v0.8H, v24.8H, v10.8H // .....................................*.................. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ 
+ // gap // ........................................................ + sqrdmulh v16.8H, v4.8H, v3.8H // .................................*...................... + // gap // ........................................................ + // gap // ........................................................ + trn2 v3.4S, v27.4S, v0.4S // ......................................*................. + // gap // ........................................................ + // gap // ........................................................ + mul v12.8H, v4.8H, v6.8H // ...................................*.................... + ldr q6, [x3], #16 // ............................................*........... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mls v25.8H, v15.8H, v7.H[0] // ..................................*..................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mls v12.8H, v16.8H, v7.H[0] // ....................................*................... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn1 v23.4S, v27.4S, v0.4S // .......................................*................ + // gap // ........................................................ 
+ // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn2 v19.4S, v25.4S, v12.4S // ........................................*............... + trn1 v18.4S, v25.4S, v12.4S // .........................................*.............. + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + trn2 v29.2D, v3.2D, v19.2D // ..........................................*............. + trn2 v21.2D, v23.2D, v18.2D // ...........................................*............ + // gap // ........................................................ + trn1 v10.2D, v3.2D, v19.2D // .............................................*.......... + trn1 v8.2D, v23.2D, v18.2D // ..............................................*......... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + add v11.8H, v21.8H, v29.8H // ...............................................*........ 
+ sub v1.8H, v21.8H, v29.8H // ................................................*....... + // gap // ........................................................ + sub v23.8H, v8.8H, v10.8H // .................................................*...... + add v0.8H, v8.8H, v10.8H // ..................................................*..... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqdmulh v22.8H, v11.8H, v7.H[1] // ...................................................*.... + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + mul v2.8H, v1.8H, v6.H[4] // .......................................................* + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + // gap // ........................................................ + sqdmulh v12.8H, v0.8H, v7.H[1] // .....................................................*.. + // gap // ........................................................ + // gap // ........................................................ + srshr v3.8H, v22.8H, #11 // ......................................................*. + // gap // ........................................................ + // gap // ........................................................ 
+ sqrdmulh v24.8H, v23.8H, v6.H[3] // ....................................................*... + // gap // ........................................................ + // gap // ........................................................ + + // original source code + // ldr q5, [x1, #32] // .*...................................................... + // ldr q22, [x1, #48] // *....................................................... + // ldr q12, [x1, #16] // ...*.................................................... + // ldr q17, [x1, #0] // ..*..................................................... + // ldr q25, [x4], #(6*16) // .....*.................................................. + // ldr q16, [x4, #-80] // .............*.......................................... + // trn2 v19.4S, v5.4S, v22.4S // ........*............................................... + // trn2 v23.4S, v17.4S, v12.4S // ...........*............................................ + // trn1 v8.4S, v5.4S, v22.4S // .........*.............................................. + // trn1 v22.4S, v17.4S, v12.4S // ............*........................................... + // trn1 v1.2D, v23.2D, v19.2D // ...............*........................................ + // trn2 v15.2D, v23.2D, v19.2D // ................*....................................... + // ldr q23, [x4, #-64] // .......*................................................ + // trn1 v27.2D, v22.2D, v8.2D // ..............*......................................... + // ldr q29, [x4, #-48] // ..........*............................................. + // ldr q14, [x4, #-32] // ......*................................................. + // trn2 v20.2D, v22.2D, v8.2D // .................*...................................... + // sub v3.8H, v27.8H, v1.8H // ..................*..................................... + // sub v21.8H, v20.8H, v15.8H // ....................*................................... 
+ // add v15.8H, v20.8H, v15.8H // .....................*.................................. + // mul v12.8H, v3.8H, v23.8H // .........................*.............................. + // add v5.8H, v27.8H, v1.8H // ...................*.................................... + // sqrdmulh v23.8H, v3.8H, v29.8H // ......................*................................. + // ldr q28, [x4, #-16] // ....*................................................... + // add v29.8H, v5.8H, v15.8H // ............................*........................... + // sqrdmulh v18.8H, v21.8H, v28.8H // .......................*................................ + // sub v9.8H, v5.8H, v15.8H // ........................*............................... + // mls v12.8H, v23.8H, v7.H[0] // ..........................*............................. + // mul v27.8H, v21.8H, v14.8H // ...........................*............................ + // mls v27.8H, v18.8H, v7.H[0] // .............................*.......................... + // sqrdmulh v1.8H, v9.8H, v16.8H // ..............................*......................... + // sub v28.8H, v12.8H, v27.8H // ................................*....................... + // mul v31.8H, v9.8H, v25.8H // ...............................*........................ + // sqrdmulh v19.8H, v28.8H, v16.8H // ..................................*..................... + // mls v31.8H, v1.8H, v7.H[0] // ......................................*................. + // mul v1.8H, v28.8H, v25.8H // ....................................*................... + // mls v1.8H, v19.8H, v7.H[0] // .......................................*................ + // add v19.8H, v12.8H, v27.8H // .................................*...................... + // trn2 v3.4S, v29.4S, v19.4S // ...................................*.................... + // trn1 v10.4S, v29.4S, v19.4S // ........................................*............... 
+ // trn2 v20.4S, v31.4S, v1.4S // .........................................*.............. + // trn1 v31.4S, v31.4S, v1.4S // ..........................................*............. + // trn2 v30.2D, v3.2D, v20.2D // ...........................................*............ + // trn2 v28.2D, v10.2D, v31.2D // ............................................*........... + // ldr q6, [x3], #16 // .....................................*.................. + // trn1 v17.2D, v3.2D, v20.2D // .............................................*.......... + // trn1 v5.2D, v10.2D, v31.2D // ..............................................*......... + // add v11.8H, v28.8H, v30.8H // ...............................................*........ + // sub v1.8H, v28.8H, v30.8H // ................................................*....... + // sub v23.8H, v5.8H, v17.8H // .................................................*...... + // add v0.8H, v5.8H, v17.8H // ..................................................*..... + // sqdmulh v8.8H, v11.8H, v7.H[1] // ...................................................*.... + // sqrdmulh v24.8H, v23.8H, v6.H[3] // .......................................................* + // sqdmulh v12.8H, v0.8H, v7.H[1] // .....................................................*.. + // srshr v3.8H, v8.8H, #11 // ......................................................*. + // mul v2.8H, v1.8H, v6.H[4] // ....................................................*... + + sub count, count, #1 +layer4567_start: + sqrdmulh v30.8H, v1.8H, v6.H[5] // .......................................................*........................... + ldr q5, [x1, #96] // ..e................................................................................ + ldr q22, [x1, #112] // ...e............................................................................... + srshr v28.8H, v12.8H, #11 // ..........................................................*........................ 
+ ldr q12, [x1, #80] // .e................................................................................. + ldr q17, [x1, #64] // e.................................................................................. + ldr q25, [x4], #(6*16) // ............e...................................................................... + mul v26.8H, v23.8H, v6.H[2] // .................................................*................................. + ldr q16, [x4, #-80] // .............e..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn2 v19.4S, v5.4S, v22.4S // .......e........................................................................... + mls v26.8H, v24.8H, v7.H[0] // ...................................................*............................... + // gap // ................................................................................... + trn2 v23.4S, v17.4S, v12.4S // .....e............................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mls v2.8H, v30.8H, v7.H[0] // ........................................................*.......................... + trn1 v8.4S, v5.4S, v22.4S // ......e............................................................................ + // gap // ................................................................................... + trn1 v22.4S, v17.4S, v12.4S // ....e.............................................................................. 
+ // gap // ................................................................................... + // gap // ................................................................................... + trn1 v1.2D, v23.2D, v19.2D // ...........e....................................................................... + mls v11.8H, v3.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + trn2 v15.2D, v23.2D, v19.2D // .........e......................................................................... + ldr q23, [x4, #-64] // ..............e.................................................................... + // gap // ................................................................................... + trn1 v27.2D, v22.2D, v8.2D // ..........e........................................................................ + sqdmulh v24.8H, v26.8H, v7.H[1] // ...............................................................*................... + ldr q29, [x4, #-48] // ...............e................................................................... + ldr q14, [x4, #-32] // ................e.................................................................. + trn2 v20.2D, v22.2D, v8.2D // ........e.......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v19.8H, v2.8H, v7.H[1] // ..................................................................*................ + sub v3.8H, v27.8H, v1.8H // ..................e................................................................ 
+ // gap // ................................................................................... + // gap // ................................................................................... + sub v21.8H, v20.8H, v15.8H // .......................e........................................................... + mls v0.8H, v28.8H, v7.H[0] // ...........................................................*....................... + // gap // ................................................................................... + srshr v22.8H, v24.8H, #11 // ................................................................*.................. + // gap // ................................................................................... + // gap // ................................................................................... + add v15.8H, v20.8H, v15.8H // ........................e.......................................................... + mul v12.8H, v3.8H, v23.8H // ....................e.............................................................. + // gap // ................................................................................... + srshr v19.8H, v19.8H, #11 // ...................................................................*............... + // gap // ................................................................................... + // gap // ................................................................................... + add v5.8H, v27.8H, v1.8H // ...................e............................................................... + sqrdmulh v23.8H, v3.8H, v29.8H // .....................e............................................................. + ldr q28, [x4, #-16] // .................e................................................................. + add v10.8H, v0.8H, v11.8H // ......................................................................*............ 
+ // gap // ................................................................................... + // gap // ................................................................................... + mls v2.8H, v19.8H, v7.H[0] // ....................................................................*.............. + // gap // ................................................................................... + // gap // ................................................................................... + add v29.8H, v5.8H, v15.8H // .............................e..................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q10, [x1], #(64) // ...............................................................................*... + sqrdmulh v18.8H, v21.8H, v28.8H // ..........................e........................................................ + sub v9.8H, v5.8H, v15.8H // ............................e...................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v12.8H, v23.8H, v7.H[0] // ......................e............................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + mul v27.8H, v21.8H, v14.8H // .........................e......................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v27.8H, v18.8H, v7.H[0] // ...........................e....................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v1.8H, v9.8H, v16.8H // ...............................e................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ mls v26.8H, v22.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... + // gap // ................................................................................... + sub v28.8H, v12.8H, v27.8H // .................................e................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mul v31.8H, v9.8H, v25.8H // ..............................e.................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v19.8H, v28.8H, v16.8H // ....................................e.............................................. + // gap // ................................................................................... + // gap // ................................................................................... + add v23.8H, v26.8H, v2.8H // ...........................................................................*....... + // gap // ................................................................................... + // gap // ................................................................................... + mls v31.8H, v1.8H, v7.H[0] // ................................e.................................................. 
+ sub v22.8H, v26.8H, v2.8H // ..........................................................................*........ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q23, [x1, #-48] // ................................................................................*.. + mul v1.8H, v28.8H, v25.8H // ...................................e............................................... + sub v28.8H, v0.8H, v11.8H // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v1.8H, v19.8H, v7.H[0] // .....................................e............................................. + add v19.8H, v12.8H, v27.8H // ..................................e................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v23.8H, v28.8H, v6.H[1] // ........................................................................*.......... + // gap // ................................................................................... 
+ // gap // ................................................................................... + trn2 v3.4S, v29.4S, v19.4S // .......................................e........................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v10.4S, v29.4S, v19.4S // ......................................e............................................ + mul v4.8H, v28.8H, v6.H[0] // .......................................................................*........... + // gap // ................................................................................... + trn2 v20.4S, v31.4S, v1.4S // .........................................e......................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v31.4S, v31.4S, v1.4S // ........................................e.......................................... + // gap // ................................................................................... + sqrdmulh v19.8H, v22.8H, v6.H[1] // .............................................................................*..... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v4.8H, v23.8H, v7.H[0] // .........................................................................*......... + trn2 v30.2D, v3.2D, v20.2D // ...........................................e....................................... + // gap // ................................................................................... 
+ trn2 v28.2D, v10.2D, v31.2D // ..........................................e........................................ + // gap // ................................................................................... + // gap // ................................................................................... + mul v22.8H, v22.8H, v6.H[0] // ............................................................................*...... + ldr q6, [x3], #16 // ..............................................e.................................... + trn1 v17.2D, v3.2D, v20.2D // .............................................e..................................... + trn1 v5.2D, v10.2D, v31.2D // ............................................e...................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v11.8H, v28.8H, v30.8H // .....................................................e............................. + mls v22.8H, v19.8H, v7.H[0] // ..............................................................................*.... + // gap // ................................................................................... + str q4, [x1, #-32] // .................................................................................*. + sub v1.8H, v28.8H, v30.8H // ....................................................e.............................. + // gap // ................................................................................... + sub v23.8H, v5.8H, v17.8H // ...............................................e................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ add v0.8H, v5.8H, v17.8H // ................................................e.................................. + sqdmulh v8.8H, v11.8H, v7.H[1] // ............................................................e...................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q22, [x1, #-16] // ..................................................................................* + sqrdmulh v24.8H, v23.8H, v6.H[3] // ..................................................e................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v12.8H, v0.8H, v7.H[1] // .........................................................e......................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v3.8H, v8.8H, #11 // .............................................................e..................... + // gap // ................................................................................... + // gap // ................................................................................... + mul v2.8H, v1.8H, v6.H[4] // ......................................................e............................ 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + + // original source code + // ldr q8, [x1, #(16*0)] // ....e.............................................................................|....e......................................................................... + // ldr q9, [x1, #(16*1)] // ...e..............................................................................|...e.......................................................................... + // ldr q10, [x1, #(16*2)] // e.................................................................................|e............................................................................. + // ldr q11, [x1, #(16*3)] // .e................................................................................|.e............................................................................ + // trn1 v25.4s, v8.4s, v9.4s // .............e....................................................................|.............e................................................................ + // trn2 v26.4s, v8.4s, v9.4s // ..........e.......................................................................|..........e................................................................... + // trn1 v27.4s, v10.4s, v11.4s // ............e.....................................................................|............e................................................................. 
+ // trn2 v28.4s, v10.4s, v11.4s // ........e.........................................................................|........e..................................................................... + // trn2 v10.2d, v25.2d, v27.2d // ......................e...........................................................|......................e....................................................... + // trn2 v11.2d, v26.2d, v28.2d // ................e.................................................................|................e............................................................. + // trn1 v8.2d, v25.2d, v27.2d // ..................e...............................................................|..................e........................................................... + // trn1 v9.2d, v26.2d, v28.2d // ..............e...................................................................|..............e............................................................... + // ldr q0, [x4], #(6*16) // .....e............................................................................|.....e........................................................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // .......e..........................................................................|.......e...................................................................... + // ldr q1, [x4, #(-6*16 + 2*16)] // .................e................................................................|.................e............................................................ + // ldr q5, [x4, #(-6*16 + 3*16)] // ....................e.............................................................|....................e......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // .....................e............................................................|.....................e........................................................ 
+ // ldr q6, [x4, #(-6*16 + 5*16)] // .................................e................................................|.................................e............................................ + // sub v24.8h, v8.8h, v9.8h // ........................e.........................................................|........................e..................................................... + // add v8.8h, v8.8h, v9.8h // ...............................e..................................................|...............................e.............................................. + // mul v9.8h, v24.8h, v1.8h // .............................e....................................................|.............................e................................................ + // sqrdmulh v24.8h, v24.8h, v5.8h // ................................e.................................................|................................e............................................. + // mls v9.8h, v24.8h, v7.h[0] // ........................................e.........................................|........................................e..................................... + // sub v24.8h, v10.8h, v11.8h // .........................e........................................................|.........................e.................................................... + // add v10.8h, v10.8h, v11.8h // ............................e.....................................................|............................e................................................. + // mul v11.8h, v24.8h, v2.8h // .........................................e........................................|.........................................e.................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ......................................e...........................................|......................................e....................................... 
+ // mls v11.8h, v24.8h, v7.h[0] // ..........................................e.......................................|..........................................e................................... + // sub v24.8h, v8.8h, v10.8h // .......................................e..........................................|.......................................e...................................... + // add v8.8h, v8.8h, v10.8h // ....................................e.............................................|....................................e......................................... + // mul v10.8h, v24.8h, v0.8h // ..............................................e...................................|..............................................e............................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ...........................................e......................................|...........................................e.................................. + // mls v10.8h, v24.8h, v7.h[0] // .................................................e................................|.................................................e............................ + // sub v24.8h, v9.8h, v11.8h // .............................................e....................................|.............................................e................................ + // add v9.8h, v9.8h, v11.8h // .......................................................e..........................|.......................................................e...................... + // mul v11.8h, v24.8h, v0.8h // ....................................................e.............................|....................................................e......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ...............................................e..................................|...............................................e.............................. 
+ // mls v11.8h, v24.8h, v7.h[0] // ......................................................e...........................|......................................................e....................... + // trn1 v25.4s, v8.4s, v9.4s // ..........................................................e.......................|..........................................................e................... + // trn2 v26.4s, v8.4s, v9.4s // .........................................................e........................|.........................................................e.................... + // trn1 v27.4s, v10.4s, v11.4s // .............................................................e....................|.............................................................e................ + // trn2 v28.4s, v10.4s, v11.4s // ............................................................e.....................|............................................................e................. + // trn2 v10.2d, v25.2d, v27.2d // .................................................................e................|.................................................................e............ + // trn2 v11.2d, v26.2d, v28.2d // ................................................................e.................|................................................................e............. + // trn1 v8.2d, v25.2d, v27.2d // .....................................................................e............|.....................................................................e........ + // trn1 v9.2d, v26.2d, v28.2d // ....................................................................e.............|....................................................................e......... + // ldr q0, [x3], #16 // ...................................................................e..............|...................................................................e.......... 
+ // sub v24.8h, v8.8h, v9.8h // ..........................................................................e.......|..........................................................................e... + // add v8.8h, v8.8h, v9.8h // ...........................................................................e......|...........................................................................e.. + // mul v9.8h, v24.8h, v0.h[2] // ......*...........................................................................|......*....................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..............................................................................e...|.............................................................................. + // mls v9.8h, v24.8h, v7.h[0] // .........*........................................................................|.........*.................................................................... + // sub v24.8h, v10.8h, v11.8h // .........................................................................e........|.........................................................................e.... + // add v10.8h, v10.8h, v11.8h // ......................................................................e...........|......................................................................e....... + // mul v11.8h, v24.8h, v0.h[4] // .................................................................................e|.............................................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................................................................................*.............................................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........*......................................................................|...........*.................................................................. 
+ // sqdmulh v25.8h, v8.8h, v7.h[1] // ...............................................................................e..|.............................................................................. + // srshr v25.8h, v25.8h, #11 // ..*...............................................................................|..*........................................................................... + // mls v8.8h, v25.8h, v7.h[0] // ..........................*.......................................................|..........................*................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ............................................................................e.....|............................................................................e. + // srshr v25.8h, v25.8h, #11 // ................................................................................e.|.............................................................................. + // mls v10.8h, v25.8h, v7.h[0] // ...............*..................................................................|...............*.............................................................. + // sqdmulh v25.8h, v9.8h, v7.h[1] // ...................*..............................................................|...................*.......................................................... + // srshr v25.8h, v25.8h, #11 // ...........................*......................................................|...........................*.................................................. + // mls v9.8h, v25.8h, v7.h[0] // ............................................*.....................................|............................................*................................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // .......................*..........................................................|.......................*...................................................... 
+ // srshr v25.8h, v25.8h, #11 // ..............................*...................................................|..............................*............................................... + // mls v11.8h, v25.8h, v7.h[0] // ...................................*..............................................|...................................*.......................................... + // sub v24.8h, v8.8h, v10.8h // .....................................................*............................|.....................................................*........................ + // add v8.8h, v8.8h, v10.8h // ..................................*...............................................|..................................*........................................... + // mul v10.8h, v24.8h, v0.h[0] // ...........................................................*......................|...........................................................*.................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ........................................................*.........................|........................................................*..................... + // mls v10.8h, v24.8h, v7.h[0] // ...............................................................*..................|...............................................................*.............. + // sub v24.8h, v9.8h, v11.8h // ..................................................*...............................|..................................................*........................... + // add v9.8h, v9.8h, v11.8h // ................................................*.................................|................................................*............................. + // mul v11.8h, v24.8h, v0.h[0] // ..................................................................*...............|..................................................................*........... 
+        // sqrdmulh v24.8h, v24.8h, v0.h[1]     // ..............................................................*...................|..............................................................*...............
+        // mls v11.8h, v24.8h, v7.h[0]          // .......................................................................*..........|.......................................................................*......
+        // str q8, [x1], #(64)                  // .....................................*............................................|.....................................*........................................
+        // str q9, [x1, #(-64 + 16*1)]          // ...................................................*..............................|...................................................*..........................
+        // str q10, [x1, #(-64 + 16*2)]         // ........................................................................*.........|........................................................................*.....
+        // str q11, [x1, #(-64 + 16*3)]         // .............................................................................*....|.............................................................................*
+        // Back-edge of layer4567_start; the straight-line code after cbnz runs once, after the last pass.
+        sub count, count, #1
+        cbnz count, layer4567_start             // loop again while count != 0
+        sqrdmulh v8.8H, v1.8H, v6.H[5]          // *..........................
+        srshr v22.8H, v12.8H, #11               // .*.........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mul v4.8H, v23.8H, v6.H[2]              // ..*........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v4.8H, v24.8H, v7.H[0]              // ...*.......................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v2.8H, v8.8H, v7.H[0]               // ....*......................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        sqdmulh v8.8H, v4.8H, v7.H[1]           // ......*....................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        sqdmulh v17.8H, v2.8H, v7.H[1]          // .......*...................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v0.8H, v22.8H, v7.H[0]              // ........*..................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        srshr v21.8H, v8.8H, #11                // .........*.................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v11.8H, v3.8H, v7.H[0]              // .....*.....................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        srshr v8.8H, v17.8H, #11                // ..........*................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v4.8H, v21.8H, v7.H[0]              // ..............*............
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v2.8H, v8.8H, v7.H[0]               // ............*..............
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        sub v8.8H, v0.8H, v11.8H                // ..................*........
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        sqrdmulh v1.8H, v8.8H, v6.H[1]          // ...................*.......
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        sub v25.8H, v4.8H, v2.8H                // ................*..........
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mul v13.8H, v8.8H, v6.H[0]              // ....................*......
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        add v8.8H, v0.8H, v11.8H                // ...........*...............
+        // gap                                  // ...........................
+        sqrdmulh v21.8H, v25.8H, v6.H[1]        // .....................*.....
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        str q8, [x1], #(64)                     // .............*.............
+        add v12.8H, v4.8H, v2.8H                // ...............*...........
+        mul v8.8H, v25.8H, v6.H[0]              // .......................*...
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v13.8H, v1.8H, v7.H[0]              // ......................*....
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        str q12, [x1, #-48]                     // .................*.........
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        mls v8.8H, v21.8H, v7.H[0]              // ........................*..
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        str q13, [x1, #-32]                     // .........................*.
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        str q8, [x1, #-16]                      // ..........................*
+        // gap                                  // ...........................
+        // gap                                  // ...........................
+        // Key: '*' above = 0-based index in the listing below; '*' below = index in the scheduled order. NOTE(review): "// gap" rows look like spacing markers, not code — confirm.
+        // original source code (pre-scheduling order of the 27 instructions above; register names differ, e.g. v30 here is v8 above)
+        // sqrdmulh v30.8H, v1.8H, v6.H[5]      // *..........................
+        // srshr v28.8H, v12.8H, #11            // .*.........................
+        // mul v26.8H, v23.8H, v6.H[2]          // ..*........................
+        // mls v26.8H, v24.8H, v7.H[0]          // ...*.......................
+        // mls v2.8H, v30.8H, v7.H[0]           // ....*......................
+        // mls v11.8H, v3.8H, v7.H[0]           // .........*.................
+        // sqdmulh v24.8H, v26.8H, v7.H[1]      // .....*.....................
+        // sqdmulh v19.8H, v2.8H, v7.H[1]       // ......*....................
+        // mls v0.8H, v28.8H, v7.H[0]           // .......*...................
+        // srshr v22.8H, v24.8H, #11            // ........*..................
+        // srshr v19.8H, v19.8H, #11            // ..........*................
+        // add v10.8H, v0.8H, v11.8H            // .................*.........
+        // mls v2.8H, v19.8H, v7.H[0]           // ............*..............
+        // str q10, [x1], #(64)                 // ...................*.......
+        // mls v26.8H, v22.8H, v7.H[0]          // ...........*...............
+        // add v23.8H, v26.8H, v2.8H            // ....................*......
+        // sub v22.8H, v26.8H, v2.8H            // ...............*...........
+        // str q23, [x1, #-48]                  // .......................*...
+        // sub v28.8H, v0.8H, v11.8H            // .............*.............
+        // sqrdmulh v23.8H, v28.8H, v6.H[1]     // ..............*............
+        // mul v4.8H, v28.8H, v6.H[0]           // ................*..........
+        // sqrdmulh v19.8H, v22.8H, v6.H[1]     // ..................*........
+        // mls v4.8H, v23.8H, v7.H[0]           // ......................*....
+        // mul v22.8H, v22.8H, v6.H[0]          // .....................*.....
+        // mls v22.8H, v19.8H, v7.H[0]          // ........................*..
+        // str q4, [x1, #-32]                   // .........................*.
+        // str q22, [x1, #-16]                  // ..........................*
+
+
+        // ---------------------------------------------------------------------
+        // Setup ahead of layer123_start (the label follows this block): ninv and ninv_tw are register aliases for v29 and v30.
+        ninv .req v29
+        ninv_tw .req v30
+        // NOTE(review): ASM_LOAD is a macro defined outside this chunk; presumably it loads the address of its second argument into the first — confirm.
+        ASM_LOAD(xtmp, ninv_addr)
+        ld1r {ninv.8h}, [xtmp]                  // replicate the 16-bit element at [xtmp] into all 8 lanes
+        ASM_LOAD(xtmp, ninv_tw_addr)
+        ld1r {ninv_tw.8h}, [xtmp]               // same broadcast, into ninv_tw
+        // NOTE(review): load_roots_123 is likewise defined elsewhere; what it loads via r_ptr0 is not visible here — confirm.
+        mov count, #4                           // decremented once right before layer123_start
+        ASM_LOAD(r_ptr0, roots_l012)
+        load_roots_123
+        // Scheduled straight-line code: eight 128-bit loads from x0 (#0..#448, 64 bytes apart), then add/sub and multiplies by lanes of v1.
+        .p2align 2                              // 4-byte (2^2) alignment
+        // '*' = 0-based index in the "original source code" listing that follows. NOTE(review): "// gap" rows look like spacing markers — confirm.
+        ldr q23, [x0, #384]                     // *.................
+        ldr q19, [x0, #448]                     // .*................
+        // gap                                  // ..................
+        ldr q2, [x0, #320]                      // ..*...............
+        // gap                                  // ..................
+        // gap                                  // ..................
+        ldr q5, [x0, #256]                      // ....*.............
+        // gap                                  // ..................
+        // gap                                  // ..................
+        ldr q4, [x0, #128]                      // ......*...........
+        // gap                                  // ..................
+        // gap                                  // ..................
+        add v24.8H, v23.8H, v19.8H              // ............*.....
+        sub v23.8H, v23.8H, v19.8H              // .....*............
+        ldr q19, [x0, #192]                     // .......*..........
+        ldr q20, [x0, #0]                       // ..........*.......
+        // gap                                  // ..................
+        // gap                                  // ..................
+        ldr q26, [x0, #64]                      // ...*..............
+        add v17.8H, v5.8H, v2.8H                // ........*.........
+        // gap                                  // ..................
+        sqrdmulh v13.8H, v23.8H, v1.H[5]        // ...........*......
+        // gap                                  // ..................
+        // gap                                  // ..................
+        sub v9.8H, v4.8H, v19.8H                // .........*........
+        // gap                                  // ..................
+        // gap                                  // ..................
+        add v27.8H, v4.8H, v19.8H               // ..............*...
+        mul v11.8H, v23.8H, v1.H[4]             // .............*....
+        // gap                                  // ..................
+        add v25.8H, v20.8H, v26.8H              // ................*.
+        // gap                                  // ..................
+        // gap                                  // ..................
+        sub v23.8H, v20.8H, v26.8H              // .................*
+        sqrdmulh v16.8H, v9.8H, v1.H[1]         // ...............*..
+        // gap                                  // ..................
+ + // original source code + // ldr q14, [x0, #384] // *................. + // ldr q26, [x0, #448] // .*................ + // ldr q2, [x0, #320] // ..*............... + // ldr q28, [x0, #64] // .........*........ + // ldr q5, [x0, #256] // ...*.............. + // sub v8.8H, v14.8H, v26.8H // ......*........... + // ldr q3, [x0, #128] // ....*............. + // ldr q20, [x0, #192] // .......*.......... + // add v17.8H, v5.8H, v2.8H // ..........*....... + // sub v9.8H, v3.8H, v20.8H // ............*..... + // ldr q23, [x0, #0] // ........*......... + // sqrdmulh v13.8H, v8.8H, v1.H[5] // ...........*...... + // add v24.8H, v14.8H, v26.8H // .....*............ + // mul v11.8H, v8.8H, v1.H[4] // ..............*... + // add v27.8H, v3.8H, v20.8H // .............*.... + // sqrdmulh v16.8H, v9.8H, v1.H[1] // .................* + // add v25.8H, v23.8H, v28.8H // ...............*.. + // sub v23.8H, v23.8H, v28.8H // ................*. + + sub count, count, #1 +layer123_start: + mls v11.8H, v13.8H, v7.H[0] // ...........................*............................................................ + sub v28.8H, v5.8H, v2.8H // ..................*..................................................................... + // gap // ........................................................................................ + ldr q14, [x0, #400] // ......e................................................................................. + // gap // ........................................................................................ + sub v5.8H, v25.8H, v27.8H // ............................*........................................................... + mul v19.8H, v23.8H, v0.H[6] // ..........*............................................................................. + add v27.8H, v25.8H, v27.8H // .............................*.......................................................... 
+ // gap // ........................................................................................ + add v20.8H, v17.8H, v24.8H // .......................................*................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v6.8H, v17.8H, v24.8H // ......................................*................................................. + // gap // ........................................................................................ + sqrdmulh v31.8H, v28.8H, v1.H[3] // .....................*.................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v22.8H, v23.8H, v0.H[7] // ...........*............................................................................ + // gap // ........................................................................................ + sub v4.8H, v27.8H, v20.8H // ................................................*....................................... + add v27.8H, v27.8H, v20.8H // .................................................*...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v13.8H, v6.8H, v0.H[4] // ........................................*............................................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v28.8H, v28.8H, v1.H[2] // ....................*................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v20.8H, v9.8H, v1.H[0] // ...............*........................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v20.8H, v16.8H, v7.H[0] // .................*...................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v24.8H, v5.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v19.8H, v22.8H, v7.H[0] // ............*........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v28.8H, v31.8H, v7.H[0] // ......................*................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v22.8H, v6.8H, v0.H[5] // .........................................*.............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v26.8H, v19.8H, v20.8H // .................................*...................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v18.8H, v19.8H, v20.8H // ..................................*..................................................... + sqrdmulh v19.8H, v5.8H, v0.H[3] // ...............................*........................................................ + // gap // ........................................................................................ + add v23.8H, v28.8H, v11.8H // ............................................*........................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v10.8H, v28.8H, v11.8H // ...........................................*............................................ + sqrdmulh v11.8H, v4.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + add v6.8H, v18.8H, v23.8H // ......................................................*................................. + // gap // ........................................................................................ + mul v8.8H, v26.8H, v0.H[2] // ...................................*.................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v28.8H, v26.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q26, [x0, #464] // .......e................................................................................ + // gap // ........................................................................................ + mls v13.8H, v22.8H, v7.H[0] // ..........................................*............................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + mul v22.8H, v4.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v24.8H, v19.8H, v7.H[0] // ................................*....................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v8.8H, v28.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v28.8H, v10.8H, v0.H[4] // .............................................*.......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v5.8H, v24.8H, v13.8H // ...........................................................*............................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v22.8H, v11.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v3.8H, v24.8H, v13.8H // ..........................................................*............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v24.8H, v18.8H, v23.8H // .....................................................*.................................. + sqrdmulh v23.8H, v10.8H, v0.H[5] // ..............................................*......................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sqrdmulh v25.8H, v3.8H, v0.H[1] // .............................................................*.......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #256] // ....................................................................*................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v22.8H, v3.8H, v0.H[0] // ............................................................*........................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v28.8H, v23.8H, v7.H[0] // ...............................................*........................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v22.8H, v25.8H, v7.H[0] // ..............................................................*......................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v3.8H, v27.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v27.8H, v27.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #384] // ......................................................................*................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v22.8H, v6.8H, v29.8H // ...........................................................................*............ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v10.8H, v8.8H, v28.8H // ...............................................................*........................ + sqrdmulh v19.8H, v6.8H, v30.8H // ............................................................................*........... + add v28.8H, v8.8H, v28.8H // ................................................................*....................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v3.8H, v27.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v24.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v20.8H, v10.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q3, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v11.8H, v24.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v11.8H, v23.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v5.8H, v30.8H // ...............................................................................*........ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v27.8H, v5.8H, v29.8H // ..............................................................................*......... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q11, [x0, #304] // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v24.8H, v10.8H, v0.H[0] // .................................................................*...................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v28.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v27.8H, v23.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mul v23.8H, v28.8H, v29.8H // .................................................................................*...... + ldr q2, [x0, #320] // .....e.................................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q28, [x0, #64] // .e...................................................................................... + // gap // ........................................................................................ + mls v23.8H, v8.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + ldr q5, [x0, #256] // ....e................................................................................... + str q27, [x0, #112] // ......................................................................................*. + sub v8.8H, v14.8H, v26.8H // .......................e................................................................ + ldr q3, [x0, #128] // ..e..................................................................................... + mls v24.8H, v20.8H, v7.H[0] // ...................................................................*.................... + ldr q20, [x0, #192] // ...e.................................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v22.8H, v19.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + add v17.8H, v5.8H, v2.8H // ...................e.................................................................... + str q23, [x0, #176] // .......................................................................................* + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v9.8H, v3.8H, v20.8H // .............e.......................................................................... + ldr q23, [x0, #0] // e....................................................................................... + sqrdmulh v13.8H, v8.8H, v1.H[5] // ..........................e............................................................. + str q24, [x0, #432] // .......................................................................*................ + add v24.8H, v14.8H, v26.8H // ........................e............................................................... + // gap // ........................................................................................ + mul v11.8H, v8.8H, v1.H[4] // .........................e.............................................................. + add v27.8H, v3.8H, v20.8H // ..............e......................................................................... + // gap // ........................................................................................ + str q22, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + sqrdmulh v16.8H, v9.8H, v1.H[1] // ................e....................................................................... + add v25.8H, v23.8H, v28.8H // .........e.............................................................................. + // gap // ........................................................................................ + sub v23.8H, v23.8H, v28.8H // ........e............................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + + // original source code + // ldr q8, [x0, #0] // ............................................................................e.........|.............................................................................e...... + // ldr q9, [x0, #(1*(512/8))] // ................................................................e.....................|.................................................................e.................. + // ldr q10, [x0, #(2*(512/8))] // .....................................................................e................|......................................................................e............. + // ldr q11, [x0, #(3*(512/8))] // .......................................................................e..............|........................................................................e........... + // ldr q12, [x0, #(4*(512/8))] // ..................................................................e...................|...................................................................e................ 
+ // ldr q13, [x0, #(5*(512/8))] // ...............................................................e......................|................................................................e................... + // ldr q14, [x0, #(6*(512/8))] // e.....................................................................................|.e.................................................................................. + // ldr q15, [x0, #(7*(512/8))] // ...........................e..........................................................|............................e....................................................... + // sub v24.8h, v8.8h, v9.8h // .....................................................................................e|.................................................................................... + // add v8.8h, v8.8h, v9.8h // ....................................................................................e.|.................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ..*...................................................................................|...*................................................................................ + // sqrdmulh v24.8h, v24.8h, v0.h[7] // .......*..............................................................................|........*........................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ...............*......................................................................|................*................................................................... + // sub v24.8h, v10.8h, v11.8h // ...........................................................................e..........|............................................................................e....... 
+ // add v10.8h, v10.8h, v11.8h // .................................................................................e....|..................................................................................e. + // mul v11.8h, v24.8h, v1.h[0] // ............*.........................................................................|.............*...................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ...................................................................................e..|.................................................................................... + // mls v11.8h, v24.8h, v7.h[0] // .............*........................................................................|..............*..................................................................... + // sub v24.8h, v12.8h, v13.8h // ......................................................................................|*................................................................................... + // add v12.8h, v12.8h, v13.8h // .........................................................................e............|..........................................................................e......... + // mul v13.8h, v24.8h, v1.h[2] // ...........*..........................................................................|............*....................................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ......*...............................................................................|.......*............................................................................ + // mls v13.8h, v24.8h, v7.h[0] // ................*.....................................................................|.................*.................................................................. 
+ // sub v24.8h, v14.8h, v15.8h // ....................................................................e.................|.....................................................................e.............. + // add v14.8h, v14.8h, v15.8h // ...............................................................................e......|................................................................................e... + // mul v15.8h, v24.8h, v1.h[4] // ................................................................................e.....|.................................................................................e.. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .............................................................................e........|..............................................................................e..... + // mls v15.8h, v24.8h, v7.h[0] // ......................................................................................*.................................................................................... + // sub v24.8h, v8.8h, v10.8h // .*....................................................................................|..*................................................................................. + // add v8.8h, v8.8h, v10.8h // ...*..................................................................................|....*............................................................................... + // mul v10.8h, v24.8h, v0.h[2] // ..............*.......................................................................|...............*.................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ....................*.................................................................|.....................*.............................................................. 
+ // mls v10.8h, v24.8h, v7.h[0] // ..............................*.......................................................|...............................*.................................................... + // sub v24.8h, v9.8h, v11.8h // ..................*...................................................................|...................*................................................................ + // add v9.8h, v9.8h, v11.8h // ...................*..................................................................|....................*............................................................... + // mul v11.8h, v24.8h, v0.h[2] // .........................*............................................................|..........................*......................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................*...........................................................|...........................*........................................................ + // mls v11.8h, v24.8h, v7.h[0] // ...............................*......................................................|................................*................................................... + // sub v24.8h, v12.8h, v14.8h // .....*................................................................................|......*............................................................................. + // add v12.8h, v12.8h, v14.8h // ....*.................................................................................|.....*.............................................................................. + // mul v14.8h, v24.8h, v0.h[4] // ..........*...........................................................................|...........*........................................................................ 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // .................*....................................................................|..................*................................................................. + // mls v14.8h, v24.8h, v7.h[0] // ............................*.........................................................|.............................*...................................................... + // sub v24.8h, v13.8h, v15.8h // ......................*...............................................................|.......................*............................................................ + // add v13.8h, v13.8h, v15.8h // .....................*................................................................|......................*............................................................. + // mul v15.8h, v24.8h, v0.h[4] // ................................*.....................................................|.................................*.................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .....................................*................................................|......................................*............................................. + // mls v15.8h, v24.8h, v7.h[0] // .........................................*............................................|..........................................*......................................... + // sub v24.8h, v8.8h, v12.8h // ........*.............................................................................|.........*.......................................................................... + // add v8.8h, v8.8h, v12.8h // .........*............................................................................|..........*......................................................................... 
+ // mul v12.8h, v24.8h, v0.h[0] // .............................*........................................................|..............................*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .......................*..............................................................|........................*........................................................... + // mls v12.8h, v24.8h, v7.h[0] // ..................................*...................................................|...................................*................................................ + // sub v24.8h, v9.8h, v13.8h // ....................................*.................................................|.....................................*.............................................. + // add v9.8h, v9.8h, v13.8h // ........................*.............................................................|.........................*.......................................................... + // mul v13.8h, v24.8h, v0.h[0] // ......................................................*...............................|.......................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................*..................................|....................................................*............................... + // mls v13.8h, v24.8h, v7.h[0] // .......................................................*..............................|........................................................*........................... + // sub v24.8h, v10.8h, v14.8h // ...................................*..................................................|....................................*............................................... 
+ // add v10.8h, v10.8h, v14.8h // .................................*....................................................|..................................*................................................. + // mul v14.8h, v24.8h, v0.h[0] // ........................................*.............................................|.........................................*.......................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ......................................*...............................................|.......................................*............................................ + // mls v14.8h, v24.8h, v7.h[0] // ..........................................*...........................................|...........................................*........................................ + // sub v24.8h, v11.8h, v15.8h // ...............................................*......................................|................................................*................................... + // add v11.8h, v11.8h, v15.8h // .................................................*....................................|..................................................*................................. + // mul v15.8h, v24.8h, v0.h[0] // ...........................................................*..........................|............................................................*....................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................*.................................|.....................................................*.............................. + // mls v15.8h, v24.8h, v7.h[0] // ......................................................................*...............|.......................................................................*............ 
+ // str q12, [x0, #(4*(512/8))] // .......................................*..............................................|........................................*........................................... + // str q13, [x0, #(5*(512/8))] // ..........................................................*...........................|...........................................................*........................ + // str q14, [x0, #(6*(512/8))] // .............................................*........................................|..............................................*..................................... + // str q15, [x0, #(7*(512/8))] // ..............................................................................*.......|...............................................................................*.... + // mul v12.8h, v8.8h, v29.8h // ...........................................*..........................................|............................................*....................................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ............................................*.........................................|.............................................*...................................... + // mls v12.8h, v8.8h, v7.h[0] // ..................................................*...................................|...................................................*................................ + // mul v13.8h, v9.8h, v29.8h // ..............................................*.......................................|...............................................*.................................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ................................................*.....................................|.................................................*.................................. 
+ // mls v13.8h, v9.8h, v7.h[0] // ........................................................................*.............|.........................................................................*.......... + // mul v14.8h, v10.8h, v29.8h // .........................................................*............................|..........................................................*......................... + // sqrdmulh v10.8h, v10.8h, v30.8h // ........................................................*.............................|.........................................................*.......................... + // mls v14.8h, v10.8h, v7.h[0] // .............................................................*........................|..............................................................*..................... + // mul v15.8h, v11.8h, v29.8h // ..............................................................*.......................|...............................................................*.................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ............................................................*.........................|.............................................................*...................... + // mls v15.8h, v11.8h, v7.h[0] // .................................................................*....................|..................................................................*................. + // str q12, [x0], #(16) // .....................................................*................................|......................................................*............................. 
+ // str q13, [x0, #(-16 + 1*(512/8))] // ..................................................................................*...|...................................................................................* + // str q14, [x0, #(-16 + 2*(512/8))] // ...................................................................*..................|....................................................................*............... + // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*...........|...........................................................................*........ + + sub count, count, #1 + cbnz count, layer123_start + mls v11.8H, v13.8H, v7.H[0] // *..................................................................... + sub v28.8H, v17.8H, v24.8H // ......*............................................................... + // gap // ...................................................................... + add v31.8H, v25.8H, v27.8H // ....*................................................................. + // gap // ...................................................................... + // gap // ...................................................................... + sub v10.8H, v25.8H, v27.8H // ..*................................................................... + mul v12.8H, v9.8H, v1.H[0] // .............*........................................................ + // gap // ...................................................................... + add v24.8H, v17.8H, v24.8H // .....*................................................................ + // gap // ...................................................................... + // gap // ...................................................................... + mul v14.8H, v28.8H, v0.H[4] // ...........*.......................................................... 
+ sub v3.8H, v5.8H, v2.8H // .*.................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v8.8H, v28.8H, v0.H[5] // ..................*................................................... + add v19.8H, v31.8H, v24.8H // ..........*........................................................... + // gap // ...................................................................... + sub v15.8H, v31.8H, v24.8H // .........*............................................................ + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v22.8H, v10.8H, v0.H[3] // .....................*................................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v28.8H, v10.8H, v0.H[2] // ...............*...................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + mls v14.8H, v8.8H, v7.H[0] // ............................*......................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v28.8H, v22.8H, v7.H[0] // ..............................*....................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v2.8H, v23.8H, v0.H[7] // ........*............................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v12.8H, v16.8H, v7.H[0] // ..............*....................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ add v27.8H, v28.8H, v14.8H // .................................*.................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v18.8H, v28.8H, v14.8H // ...................................*.................................. + mul v28.8H, v23.8H, v0.H[6] // ...*.................................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v23.8H, v3.8H, v1.H[3] // .......*.............................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v28.8H, v2.8H, v7.H[0] // ................*..................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v6.8H, v3.8H, v1.H[2] // ............*......................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v6.8H, v23.8H, v7.H[0] // .................*.................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v17.8H, v28.8H, v12.8H // ....................*................................................. + mul v23.8H, v27.8H, v29.8H // .........................................................*............ + sub v28.8H, v28.8H, v12.8H // ...................*.................................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v10.8H, v18.8H, v0.H[1] // ......................................*............................... + // gap // ...................................................................... + // gap // ...................................................................... + add v16.8H, v6.8H, v11.8H // ......................*............................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + mul v22.8H, v18.8H, v0.H[0] // ........................................*............................. + sub v9.8H, v6.8H, v11.8H // .......................*.............................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + add v6.8H, v17.8H, v16.8H // .........................*............................................ + sqrdmulh v20.8H, v27.8H, v30.8H // ........................................................*............. + // gap // ...................................................................... + sub v14.8H, v17.8H, v16.8H // ....................................*................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v22.8H, v10.8H, v7.H[0] // ..........................................*........................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v2.8H, v14.8H, v0.H[1] // ...................................................*.................. 
+ // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v11.8H, v6.8H, v29.8H // ..............................................*....................... + // gap // ...................................................................... + // gap // ...................................................................... + str q22, [x0, #384] // .............................................*........................ + // gap // ...................................................................... + // gap // ...................................................................... + mls v23.8H, v20.8H, v7.H[0] // .............................................................*........ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v5.8H, v15.8H, v0.H[0] // .............................*........................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v22.8H, v28.8H, v0.H[3] // ...........................*.......................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + str q23, [x0, #128] // ................................................................*..... + // gap // ...................................................................... + // gap // ...................................................................... + mul v3.8H, v28.8H, v0.H[2] // ..........................*........................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v26.8H, v6.8H, v30.8H // ................................................*..................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v3.8H, v22.8H, v7.H[0] // ...............................*...................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ sqrdmulh v17.8H, v9.8H, v0.H[5] // .....................................*................................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v28.8H, v15.8H, v0.H[1] // ........................*............................................. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mul v23.8H, v9.8H, v0.H[4] // ................................*..................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v23.8H, v17.8H, v7.H[0] // .........................................*............................ + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + // gap // ...................................................................... + mls v11.8H, v26.8H, v7.H[0] // ..................................................................*... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v5.8H, v28.8H, v7.H[0] // ..................................*................................... + // gap // ...................................................................... + // gap // ...................................................................... + sub v24.8H, v3.8H, v23.8H // ...............................................*...................... + // gap // ...................................................................... + // gap // ...................................................................... + add v31.8H, v3.8H, v23.8H // .................................................*.................... + mul v3.8H, v14.8H, v0.H[0] // ......................................................*............... + // gap // ...................................................................... + str q11, [x0, #64] // .....................................................................* + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v4.8H, v24.8H, v0.H[1] // ....................................................*................. + // gap // ...................................................................... 
+ // gap // ...................................................................... + str q5, [x0, #256] // .......................................*.............................. + // gap // ...................................................................... + // gap // ...................................................................... + mul v8.8H, v24.8H, v0.H[0] // ...........................................................*.......... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v27.8H, v19.8H, v30.8H // ............................................*......................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v8.8H, v4.8H, v7.H[0] // .................................................................*.... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + mls v3.8H, v2.8H, v7.H[0] // .......................................................*.............. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + sqrdmulh v25.8H, v31.8H, v30.8H // ............................................................*......... + str q8, [x0, #448] // ....................................................................*. + // gap // ...................................................................... + // gap // ...................................................................... + mul v12.8H, v31.8H, v29.8H // ..............................................................*....... + // gap // ...................................................................... + // gap // ...................................................................... + str q3, [x0, #320] // ..........................................................*........... + // gap // ...................................................................... + // gap // ...................................................................... + mul v22.8H, v19.8H, v29.8H // ...........................................*.......................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... 
+ // gap // ...................................................................... + mls v12.8H, v25.8H, v7.H[0] // ...............................................................*...... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + mls v22.8H, v27.8H, v7.H[0] // ..................................................*................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q12, [x0, #192] // ...................................................................*.. + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + // gap // ...................................................................... + str q22, [x0], #(16) // .....................................................*................ 
+ // gap // ...................................................................... + // gap // ...................................................................... + + // original source code + // mls v11.8H, v13.8H, v7.H[0] // *..................................................................... + // sub v28.8H, v5.8H, v2.8H // .......*.............................................................. + // sub v5.8H, v25.8H, v27.8H // ...*.................................................................. + // mul v19.8H, v23.8H, v0.H[6] // ...................*.................................................. + // add v27.8H, v25.8H, v27.8H // ..*................................................................... + // add v20.8H, v17.8H, v24.8H // .....*................................................................ + // sub v6.8H, v17.8H, v24.8H // .*.................................................................... + // sqrdmulh v31.8H, v28.8H, v1.H[3] // ....................*................................................. + // sqrdmulh v22.8H, v23.8H, v0.H[7] // ...............*...................................................... + // sub v4.8H, v27.8H, v20.8H // ..........*........................................................... + // add v27.8H, v27.8H, v20.8H // .........*............................................................ + // mul v13.8H, v6.8H, v0.H[4] // ......*............................................................... + // mul v28.8H, v28.8H, v1.H[2] // ......................*............................................... + // mul v20.8H, v9.8H, v1.H[0] // ....*................................................................. + // mls v20.8H, v16.8H, v7.H[0] // ................*..................................................... + // mul v24.8H, v5.8H, v0.H[2] // ............*......................................................... 
+ // mls v19.8H, v22.8H, v7.H[0] // .....................*................................................ + // mls v28.8H, v31.8H, v7.H[0] // .......................*.............................................. + // sqrdmulh v22.8H, v6.8H, v0.H[5] // ........*............................................................. + // sub v26.8H, v19.8H, v20.8H // ..........................*........................................... + // add v18.8H, v19.8H, v20.8H // ........................*............................................. + // sqrdmulh v19.8H, v5.8H, v0.H[3] // ...........*.......................................................... + // add v23.8H, v28.8H, v11.8H // ............................*......................................... + // sub v10.8H, v28.8H, v11.8H // ..............................*....................................... + // sqrdmulh v11.8H, v4.8H, v0.H[1] // ..............................................*....................... + // add v6.8H, v18.8H, v23.8H // ...............................*...................................... + // mul v8.8H, v26.8H, v0.H[2] // ..........................................*........................... + // sqrdmulh v28.8H, v26.8H, v0.H[3] // ........................................*............................. + // mls v13.8H, v22.8H, v7.H[0] // .............*........................................................ + // mul v22.8H, v4.8H, v0.H[0] // .......................................*.............................. + // mls v24.8H, v19.8H, v7.H[0] // ..............*....................................................... + // mls v8.8H, v28.8H, v7.H[0] // ............................................*......................... + // mul v28.8H, v10.8H, v0.H[4] // ...............................................*...................... + // add v5.8H, v24.8H, v13.8H // .................*.................................................... 
+ // mls v22.8H, v11.8H, v7.H[0] // ..................................................*................... + // sub v3.8H, v24.8H, v13.8H // ..................*................................................... + // sub v24.8H, v18.8H, v23.8H // .................................*.................................... + // sqrdmulh v23.8H, v10.8H, v0.H[5] // .............................................*........................ + // sqrdmulh v25.8H, v3.8H, v0.H[1] // ...........................*.......................................... + // str q22, [x0, #256] // ........................................................*............. + // mul v22.8H, v3.8H, v0.H[0] // .............................*........................................ + // mls v28.8H, v23.8H, v7.H[0] // ................................................*..................... + // mls v22.8H, v25.8H, v7.H[0] // ..................................*................................... + // mul v3.8H, v27.8H, v29.8H // .................................................................*.... + // sqrdmulh v27.8H, v27.8H, v30.8H // ..........................................................*........... + // str q22, [x0, #384] // .....................................*................................ + // mul v22.8H, v6.8H, v29.8H // ....................................*................................. + // sub v10.8H, v8.8H, v28.8H // ...................................................*.................. + // sqrdmulh v19.8H, v6.8H, v30.8H // ...........................................*.......................... + // add v28.8H, v8.8H, v28.8H // ....................................................*................. + // mls v3.8H, v27.8H, v7.H[0] // ...................................................................*.. + // sqrdmulh v23.8H, v24.8H, v0.H[1] // ...................................*.................................. 
+ // sqrdmulh v20.8H, v10.8H, v0.H[1] // .......................................................*.............. + // str q3, [x0], #(16) // .....................................................................* + // mul v11.8H, v24.8H, v0.H[0] // .....................................................*................ + // mls v11.8H, v23.8H, v7.H[0] // ............................................................*......... + // sqrdmulh v23.8H, v5.8H, v30.8H // ................................*..................................... + // mul v27.8H, v5.8H, v29.8H // .........................*............................................ + // str q11, [x0, #304] // ................................................................*..... + // mul v24.8H, v10.8H, v0.H[0] // .........................................................*............ + // sqrdmulh v8.8H, v28.8H, v30.8H // .............................................................*........ + // mls v27.8H, v23.8H, v7.H[0] // ......................................*............................... + // mul v23.8H, v28.8H, v29.8H // ...............................................................*...... + // mls v23.8H, v8.8H, v7.H[0] // ..................................................................*... + // str q27, [x0, #112] // .........................................*............................ + // mls v24.8H, v20.8H, v7.H[0] // ...........................................................*.......... + // mls v22.8H, v19.8H, v7.H[0] // .................................................*.................... + // str q23, [x0, #176] // ....................................................................*. + // str q24, [x0, #432] // ..............................................................*....... + // str q22, [x0, #48] // ......................................................*............... 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s new file mode 100644 index 00000000..d3209bce --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_firestorm.s @@ -0,0 +1,1776 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive (NOTE(review): the header name was missing from this include; hal_env.h is presumed - confirm against the build) +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. 
+ +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. +// ldr_vo, ldr_vi, str_vo, str_vi: load or store one q register (named through its qform_ alias) with an immediate offset (vo) or with post-increment of the base (vi). +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc +.endm +// Thin wrappers over 8h-lane arithmetic; the four-argument forms use lane i of b as the second operand. +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] +.endm +// mulmodq: dst = src * const[idx0], then src = sqrdmulh(src, const[idx1]), then dst -= src * consts[0]. Overwrites src. +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 + vqrdmulhq \src, \src, \const, \idx1 + vmlsq \dst, \src, consts, 0 +.endm +// mulmod: same sequence as mulmodq, but with full-vector multipliers const and const_twisted. Overwrites src. +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm +// gs_butterfly: tmp = a - b, a = sum of a and b, b = mulmodq of tmp by lanes idx0/idx1 of root. Clobbers tmp. +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmodq \b, tmp, \root, \idx0, \idx1 +.endm +// mulmod_v: NOTE(review): body is identical to mulmod. +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm +// gs_butterfly_v: full-vector-root variant of gs_butterfly; NOTE(review): it expands mulmod rather than mulmod_v. +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h + add \a\().8h, \a\().8h, \b\().8h + mulmod \b, tmp, \root, \root_twisted +.endm +// mul_ninv: mulmod by ninv and ninv_tw applied to four src/dst pairs. +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm +// barrett_reduce: t0 = sqdmulh(a, consts[1]); t0 = srshr(t0, 11); a -= t0 * consts[0]. Clobbers t0. +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 + srshr t0.8h, t0.8h, #11 + vmlsq \a, t0, consts, 0 +.endm +// load_roots_123: loads root0 and advances r_ptr0 by 32, then loads root1 from 16 bytes below the advanced pointer. +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, 
r_ptr0, -16 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 +.endm +// load_next_roots_67: six consecutive 16-byte loads from r_ptr1; the first one post-increments by 6*16, so the later offsets are relative to the advanced pointer. +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm +// transpose4: trn1/trn2 on .4s lanes into t0-t3, then trn1/trn2 on .2d lanes back into data0..data3 (digit appended to the data argument). +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm +// transpose_single: only the .4s trn1/trn2 step, written to data_out0..3. +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // slothy:no-unfold + // Reserve 96 bytes and spill x19-x29 (each pair stored once); restore_gprs is the inverse. + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) +.endm +// save_vregs / restore_vregs: spill and reload d8-d15 in a 64-byte stack frame. +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +#define STACK_SIZE 16 +#define STACK0 0 +// restore / save: move a register from or to the stack slot at byte offset loc from sp. +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] +.endm +.macro save loc, a // slothy:no-unfold + str \a, 
[sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm +// pop_stack: undoes push_stack in reverse order. +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_opt_m1_firestorm + .global _intt_kyber_123_4567_opt_m1_firestorm + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_opt_m1_firestorm: +_intt_kyber_123_4567_opt_m1_firestorm: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + 
xt_31 .req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q27, [x4, #48] // ...........*............................... + ldr q15, [x1, #0] // ....*...................................... + ldr q29, [x4, #32] // ........*.................................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q24, [x1, #16] // .....*..................................... + ldr q16, [x1, #48] // ..*........................................ + ldr q21, [x1, #32] // .*......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q5, [x3], #16 // ..........................................* + ldr q31, [x4, #64] // ......*.................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn1 v28.4S, v15.4S, v24.4S // .........*................................. + trn1 v25.4S, v21.4S, v16.4S // ..........*................................ + trn2 v19.4S, v21.4S, v16.4S // ............*.............................. + ldr q21, [x4, #80] // ...*....................................... + trn2 v26.4S, v15.4S, v24.4S // .............*............................. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + trn1 v9.2D, v26.2D, v19.2D // ................*.......................... + trn2 v12.2D, v26.2D, v19.2D // .................*......................... + trn1 v13.2D, v28.2D, v25.2D // ..............*............................ + trn2 v2.2D, v28.2D, v25.2D // ...............*........................... + ldr q25, [x4, #16] // .......*................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + ldr q22, [x4], #(6*16) // *.......................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sub v1.8H, v13.8H, v9.8H // ...................*....................... + add v0.8H, v13.8H, v9.8H // ....................*...................... + sub v11.8H, v2.8H, v12.8H // ..................*........................ + add v17.8H, v2.8H, v12.8H // ...........................*............... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + sqrdmulh v21.8H, v11.8H, v21.8H // ........................*.................. + mul v19.8H, v1.8H, v29.8H // .......................*................... + mul v31.8H, v11.8H, v31.8H // ......................*.................... + sqrdmulh v28.8H, v1.8H, v27.8H // .....................*..................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + add v12.8H, v0.8H, v17.8H // ............................*.............. + sub v0.8H, v0.8H, v17.8H // .............................*............. + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + sqrdmulh v13.8H, v0.8H, v25.8H // ................................*.......... + mls v31.8H, v21.8H, v7.H[0] // ..........................*................ + mls v19.8H, v28.8H, v7.H[0] // .........................*................. + mul v8.8H, v0.8H, v22.8H // ...............................*........... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v8.8H, v13.8H, v7.H[0] // .....................................*..... + sub v4.8H, v19.8H, v31.8H // ..............................*............ + add v13.8H, v19.8H, v31.8H // ...................................*....... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + sqrdmulh v14.8H, v4.8H, v25.8H // .................................*......... + mul v21.8H, v4.8H, v22.8H // ..................................*........ + trn2 v23.4S, v12.4S, v13.4S // .......................................*... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + mls v21.8H, v14.8H, v7.H[0] // ....................................*...... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... 
+ // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + trn2 v24.4S, v8.4S, v21.4S // .........................................*. + trn1 v0.4S, v8.4S, v21.4S // ........................................*.. + trn1 v8.4S, v12.4S, v13.4S // ......................................*.... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + // gap // ........................................... + + // original source code + // ldr q13, [x4], #(6*16) // ..................*........................ + // ldr q27, [x1, #32] // .....*..................................... + // ldr q1, [x1, #48] // ....*...................................... + // ldr q4, [x4, #-16] // ...........*............................... + // ldr q17, [x1, #0] // .*......................................... + // ldr q29, [x1, #16] // ...*....................................... + // ldr q2, [x4, #-32] // .......*................................... + // ldr q21, [x4, #-80] // .................*......................... 
+ // ldr q28, [x4, #-64] // ..*........................................ + // trn1 v3.4S, v17.4S, v29.4S // ........*.................................. + // trn1 v26.4S, v27.4S, v1.4S // .........*................................. + // ldr q25, [x4, #-48] // *.......................................... + // trn2 v20.4S, v27.4S, v1.4S // ..........*................................ + // trn2 v29.4S, v17.4S, v29.4S // ............*.............................. + // trn1 v8.2D, v3.2D, v26.2D // ...............*........................... + // trn2 v3.2D, v3.2D, v26.2D // ................*.......................... + // trn1 v1.2D, v29.2D, v20.2D // .............*............................. + // trn2 v20.2D, v29.2D, v20.2D // ..............*............................ + // sub v27.8H, v3.8H, v20.8H // .....................*..................... + // sub v12.8H, v8.8H, v1.8H // ...................*....................... + // add v18.8H, v8.8H, v1.8H // ....................*...................... + // sqrdmulh v17.8H, v12.8H, v25.8H // ..........................*................ + // mul v14.8H, v27.8H, v2.8H // .........................*................. + // mul v9.8H, v12.8H, v28.8H // ........................*.................. + // sqrdmulh v31.8H, v27.8H, v4.8H // .......................*................... + // mls v9.8H, v17.8H, v7.H[0] // ...............................*........... + // mls v14.8H, v31.8H, v7.H[0] // ..............................*............ + // add v31.8H, v3.8H, v20.8H // ......................*.................... + // add v25.8H, v18.8H, v31.8H // ...........................*............... + // sub v29.8H, v18.8H, v31.8H // ............................*.............. + // sub v28.8H, v9.8H, v14.8H // ..................................*........ + // mul v27.8H, v29.8H, v13.8H // ................................*.......... + // sqrdmulh v3.8H, v29.8H, v21.8H // .............................*............. 
+ // sqrdmulh v26.8H, v28.8H, v21.8H // ....................................*...... + // mul v1.8H, v28.8H, v13.8H // .....................................*..... + // add v18.8H, v9.8H, v14.8H // ...................................*....... + // mls v1.8H, v26.8H, v7.H[0] // .......................................*... + // mls v27.8H, v3.8H, v7.H[0] // .................................*......... + // trn1 v8.4S, v25.4S, v18.4S // ..........................................* + // trn2 v23.4S, v25.4S, v18.4S // ......................................*.... + // trn1 v0.4S, v27.4S, v1.4S // .........................................*. + // trn2 v24.4S, v27.4S, v1.4S // ........................................*.. + // ldr q5, [x3], #16 // ......*.................................... + + sub count, count, #1 +layer4567_start: + ldr q13, [x4], #(6*16) // ............e...................................................................... + trn1 v18.2D, v8.2D, v0.2D // ............................................*...................................... + trn2 v11.2D, v8.2D, v0.2D // ..........................................*........................................ + ldr q27, [x1, #96] // ..e................................................................................ + ldr q1, [x1, #112] // ...e............................................................................... + // gap // ................................................................................... + trn1 v10.2D, v23.2D, v24.2D // .............................................*..................................... + trn2 v24.2D, v23.2D, v24.2D // ...........................................*....................................... + ldr q4, [x4, #-16] // .................e................................................................. + ldr q17, [x1, #64] // e.................................................................................. 
+ ldr q29, [x1, #80] // .e................................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v19.8H, v11.8H, v24.8H // ....................................................*.............................. + add v22.8H, v11.8H, v24.8H // .....................................................*............................. + sub v15.8H, v18.8H, v10.8H // ...............................................*................................... + add v10.8H, v18.8H, v10.8H // ................................................*.................................. + ldr q2, [x4, #-32] // ................e.................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q21, [x4, #-80] // .............e..................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v23.8H, v15.8H, v5.H[3] // ..................................................*................................ + mul v6.8H, v15.8H, v5.H[2] // .................................................*................................. + sqrdmulh v9.8H, v19.8H, v5.H[5] // .......................................................*........................... + mul v24.8H, v19.8H, v5.H[4] // ......................................................*............................ + ldr q28, [x4, #-64] // ..............e.................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v3.4S, v17.4S, v29.4S // ....e.............................................................................. + trn1 v26.4S, v27.4S, v1.4S // ......e............................................................................ + ldr q25, [x4, #-48] // ...............e................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ sqdmulh v0.8H, v10.8H, v7.H[1] // .........................................................*......................... + trn2 v20.4S, v27.4S, v1.4S // .......e........................................................................... + trn2 v29.4S, v17.4S, v29.4S // .....e............................................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v6.8H, v23.8H, v7.H[0] // ...................................................*............................... + trn1 v8.2D, v3.2D, v26.2D // ..........e........................................................................ + mls v24.8H, v9.8H, v7.H[0] // ........................................................*.......................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v16.8H, v0.8H, #11 // ..........................................................*........................ + trn2 v3.2D, v3.2D, v26.2D // ........e.......................................................................... 
+ trn1 v1.2D, v29.2D, v20.2D // ...........e....................................................................... + trn2 v20.2D, v29.2D, v20.2D // .........e......................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v0.8H, v22.8H, v7.H[1] // ............................................................*...................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sub v27.8H, v3.8H, v20.8H // .......................e........................................................... + sqdmulh v29.8H, v24.8H, v7.H[1] // ..................................................................*................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + sub v12.8H, v8.8H, v1.8H // ..................e................................................................ + sqdmulh v30.8H, v6.8H, v7.H[1] // ...............................................................*................... + mls v10.8H, v16.8H, v7.H[0] // ...........................................................*....................... + add v18.8H, v8.8H, v1.8H // ...................e............................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v17.8H, v12.8H, v25.8H // .....................e............................................................. + mul v14.8H, v27.8H, v2.8H // .........................e......................................................... + mul v9.8H, v12.8H, v28.8H // ....................e.............................................................. + sqrdmulh v31.8H, v27.8H, v4.8H // ..........................e........................................................ + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ srshr v23.8H, v30.8H, #11 // ................................................................*.................. + srshr v26.8H, v0.8H, #11 // .............................................................*..................... + srshr v0.8H, v29.8H, #11 // ...................................................................*............... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v9.8H, v17.8H, v7.H[0] // ......................e............................................................ + mls v14.8H, v31.8H, v7.H[0] // ...........................e....................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v31.8H, v3.8H, v20.8H // ........................e.......................................................... + mls v24.8H, v0.8H, v7.H[0] // ....................................................................*.............. + mls v22.8H, v26.8H, v7.H[0] // ..............................................................*.................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v6.8H, v23.8H, v7.H[0] // .................................................................*................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + add v25.8H, v18.8H, v31.8H // .............................e..................................................... + sub v29.8H, v18.8H, v31.8H // ............................e...................................................... + sub v28.8H, v9.8H, v14.8H // .................................e................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v16.8H, v6.8H, v24.8H // ...........................................................................*....... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v2.8H, v10.8H, v22.8H // ......................................................................*............ + sub v10.8H, v10.8H, v22.8H // .....................................................................*............. + sub v24.8H, v6.8H, v24.8H // ..........................................................................*........ + mul v27.8H, v29.8H, v13.8H // ..............................e.................................................... 
+ sqrdmulh v3.8H, v29.8H, v21.8H // ...............................e................................................... + sqrdmulh v26.8H, v28.8H, v21.8H // ....................................e.............................................. + mul v1.8H, v28.8H, v13.8H // ...................................e............................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + str q16, [x1, #16] // ................................................................................*.. + mul v22.8H, v24.8H, v5.H[0] // ............................................................................*...... + sqrdmulh v11.8H, v24.8H, v5.H[1] // .............................................................................*..... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v0.8H, v10.8H, v5.H[1] // ........................................................................*.......... + mul v10.8H, v10.8H, v5.H[0] // .......................................................................*........... + str q2, [x1], #(64) // ...............................................................................*... + add v18.8H, v9.8H, v14.8H // ..................................e................................................ + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v1.8H, v26.8H, v7.H[0] // .....................................e............................................. + mls v27.8H, v3.8H, v7.H[0] // ................................e.................................................. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v22.8H, v11.8H, v7.H[0] // ..............................................................................*.... + mls v10.8H, v0.8H, v7.H[0] // .........................................................................*......... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + trn1 v8.4S, v25.4S, v18.4S // ......................................e............................................ + trn2 v23.4S, v25.4S, v18.4S // .......................................e........................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + trn1 v0.4S, v27.4S, v1.4S // ........................................e.......................................... + trn2 v24.4S, v27.4S, v1.4S // .........................................e......................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ str q22, [x1, #-16] // ..................................................................................* + str q10, [x1, #-32] // .................................................................................*. + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + ldr q5, [x3], #16 // ..............................................e.................................... + + // original source code + // ldr q8, [x1, #(16*0)] // ........e..........................................................................|.......e......................................................................... + // ldr q9, [x1, #(16*1)] // .........e.........................................................................|........e........................................................................ + // ldr q10, [x1, #(16*2)] // ...e...............................................................................|..e.............................................................................. + // ldr q11, [x1, #(16*3)] // ....e..............................................................................|...e............................................................................. + // trn1 v25.4s, v8.4s, v9.4s // .....................e.............................................................|....................e............................................................ 
+ // trn2 v26.4s, v8.4s, v9.4s // ..........................e........................................................|.........................e....................................................... + // trn1 v27.4s, v10.4s, v11.4s // ......................e............................................................|.....................e........................................................... + // trn2 v28.4s, v10.4s, v11.4s // .........................e.........................................................|........................e........................................................ + // trn2 v10.2d, v25.2d, v27.2d // ...............................e...................................................|..............................e.................................................. + // trn2 v11.2d, v26.2d, v28.2d // .................................e.................................................|................................e................................................ + // trn1 v8.2d, v25.2d, v27.2d // ............................e......................................................|...........................e..................................................... + // trn1 v9.2d, v26.2d, v28.2d // ................................e..................................................|...............................e................................................. + // ldr q0, [x4], #(6*16) // e..................................................................................e................................................................................. + // ldr q4, [x4, #(-6*16 + 1*16)] // ...............e...................................................................|..............e.................................................................. 
+ // ldr q1, [x4, #(-6*16 + 2*16)] // ....................e..............................................................|...................e............................................................. + // ldr q5, [x4, #(-6*16 + 3*16)] // .......................e...........................................................|......................e.......................................................... + // ldr q2, [x4, #(-6*16 + 4*16)] // ..............e....................................................................|.............e................................................................... + // ldr q6, [x4, #(-6*16 + 5*16)] // .......e...........................................................................|......e.......................................................................... + // sub v24.8h, v8.8h, v9.8h // .....................................e.............................................|....................................e............................................ + // add v8.8h, v8.8h, v9.8h // ........................................e..........................................|.......................................e......................................... + // mul v9.8h, v24.8h, v1.8h // ...........................................e.......................................|..........................................e...................................... + // sqrdmulh v24.8h, v24.8h, v5.8h // .........................................e.........................................|........................................e........................................ + // mls v9.8h, v24.8h, v7.h[0] // ................................................e..................................|...............................................e................................. 
+ // sub v24.8h, v10.8h, v11.8h // ...................................e...............................................|..................................e.............................................. + // add v10.8h, v10.8h, v11.8h // ..................................................e................................|.................................................e............................... + // mul v11.8h, v24.8h, v2.8h // ..........................................e........................................|.........................................e....................................... + // sqrdmulh v24.8h, v24.8h, v6.8h // ............................................e......................................|...........................................e..................................... + // mls v11.8h, v24.8h, v7.h[0] // .................................................e.................................|................................................e................................ + // sub v24.8h, v8.8h, v10.8h // .......................................................e...........................|......................................................e.......................... + // add v8.8h, v8.8h, v10.8h // ......................................................e............................|.....................................................e........................... + // mul v10.8h, v24.8h, v0.8h // .............................................................e.....................|............................................................e.................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ..............................................................e....................|.............................................................e................... 
+ // mls v10.8h, v24.8h, v7.h[0] // .........................................................................e.........|........................................................................e........ + // sub v24.8h, v9.8h, v11.8h // ........................................................e..........................|.......................................................e......................... + // add v9.8h, v9.8h, v11.8h // .......................................................................e...........|......................................................................e.......... + // mul v11.8h, v24.8h, v0.8h // ................................................................e..................|...............................................................e................. + // sqrdmulh v24.8h, v24.8h, v4.8h // ...............................................................e...................|..............................................................e.................. + // mls v11.8h, v24.8h, v7.h[0] // ........................................................................e..........|.......................................................................e......... + // trn1 v25.4s, v8.4s, v9.4s // ............................................................................e......|...........................................................................e..... + // trn2 v26.4s, v8.4s, v9.4s // .............................................................................e.....|............................................................................e.... + // trn1 v27.4s, v10.4s, v11.4s // ..............................................................................e....|.............................................................................e... 
+ // trn2 v28.4s, v10.4s, v11.4s // ...............................................................................e...|..............................................................................e.. + // trn2 v10.2d, v25.2d, v27.2d // ..*................................................................................|.*............................................................................... + // trn2 v11.2d, v26.2d, v28.2d // ......*............................................................................|.....*........................................................................... + // trn1 v8.2d, v25.2d, v27.2d // .*.................................................................................|*................................................................................ + // trn1 v9.2d, v26.2d, v28.2d // .....*.............................................................................|....*............................................................................ + // ldr q0, [x3], #16 // ..................................................................................e|................................................................................. + // sub v24.8h, v8.8h, v9.8h // ............*......................................................................|...........*..................................................................... + // add v8.8h, v8.8h, v9.8h // .............*.....................................................................|............*.................................................................... + // mul v9.8h, v24.8h, v0.h[2] // .................*.................................................................|................*................................................................ 
+ // sqrdmulh v24.8h, v24.8h, v0.h[3] // ................*..................................................................|...............*................................................................. + // mls v9.8h, v24.8h, v7.h[0] // ...........................*.......................................................|..........................*...................................................... + // sub v24.8h, v10.8h, v11.8h // ..........*........................................................................|.........*....................................................................... + // add v10.8h, v10.8h, v11.8h // ...........*.......................................................................|..........*...................................................................... + // mul v11.8h, v24.8h, v0.h[4] // ...................*...............................................................|..................*.............................................................. + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ..................*................................................................|.................*............................................................... + // mls v11.8h, v24.8h, v7.h[0] // .............................*.....................................................|............................*.................................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ........................*..........................................................|.......................*......................................................... + // srshr v25.8h, v25.8h, #11 // ..............................*....................................................|.............................*................................................... 
+ // mls v8.8h, v25.8h, v7.h[0] // .......................................*...........................................|......................................*.......................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ..................................*................................................|.................................*............................................... + // srshr v25.8h, v25.8h, #11 // ..............................................*....................................|.............................................*................................... + // mls v10.8h, v25.8h, v7.h[0] // ....................................................*..............................|...................................................*............................. + // sqdmulh v25.8h, v9.8h, v7.h[1] // ......................................*............................................|.....................................*........................................... + // srshr v25.8h, v25.8h, #11 // .............................................*.....................................|............................................*.................................... + // mls v9.8h, v25.8h, v7.h[0] // .....................................................*.............................|....................................................*............................ + // sqdmulh v25.8h, v11.8h, v7.h[1] // ....................................*..............................................|...................................*............................................. + // srshr v25.8h, v25.8h, #11 // ...............................................*...................................|..............................................*.................................. 
+ // mls v11.8h, v25.8h, v7.h[0] // ...................................................*...............................|..................................................*.............................. + // sub v24.8h, v8.8h, v10.8h // ...........................................................*.......................|..........................................................*...................... + // add v8.8h, v8.8h, v10.8h // ..........................................................*........................|.........................................................*....................... + // mul v10.8h, v24.8h, v0.h[0] // .....................................................................*.............|....................................................................*............ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ....................................................................*..............|...................................................................*............. + // mls v10.8h, v24.8h, v7.h[0] // ...........................................................................*.......|..........................................................................*...... + // sub v24.8h, v9.8h, v11.8h // ............................................................*......................|...........................................................*..................... + // add v9.8h, v9.8h, v11.8h // .........................................................*.........................|........................................................*........................ + // mul v11.8h, v24.8h, v0.h[0] // ..................................................................*................|.................................................................*............... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ...................................................................*...............|..................................................................*.............. + // mls v11.8h, v24.8h, v7.h[0] // ..........................................................................*........|.........................................................................*....... + // str q8, [x1], #(64) // ......................................................................*............|.....................................................................*........... + // str q9, [x1, #(-64 + 16*1)] // .................................................................*.................|................................................................*................ + // str q10, [x1, #(-64 + 16*2)] // .................................................................................*.|................................................................................* + // str q11, [x1, #(-64 + 16*3)] // ................................................................................*..|...............................................................................*. + + sub count, count, #1 + cbnz count, layer4567_start + trn2 v1.2D, v8.2D, v0.2D // .*...................................... + trn1 v14.2D, v8.2D, v0.2D // *....................................... + trn2 v17.2D, v23.2D, v24.2D // ...*.................................... + trn1 v0.2D, v23.2D, v24.2D // ..*..................................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v12.8H, v1.8H, v17.8H // ....*................................... + sub v9.8H, v14.8H, v0.8H // ......*................................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v16.8H, v1.8H, v17.8H // .....*.................................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v11.8H, v12.8H, v5.H[5] // ..........*............................. + mul v19.8H, v12.8H, v5.H[4] // ...........*............................ + sqrdmulh v25.8H, v9.8H, v5.H[3] // ........*............................... + mul v30.8H, v9.8H, v5.H[2] // .........*.............................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + add v12.8H, v14.8H, v0.8H // .......*................................ + sqdmulh v2.8H, v16.8H, v7.H[1] // ................*....................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v19.8H, v11.8H, v7.H[0] // ..............*......................... + mls v30.8H, v25.8H, v7.H[0] // .............*.......................... + sqdmulh v25.8H, v12.8H, v7.H[1] // ............*........................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + srshr v15.8H, v2.8H, #11 // .....................*.................. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + srshr v31.8H, v25.8H, #11 // ...............*........................ 
+ sqdmulh v22.8H, v19.8H, v7.H[1] // .................*...................... + sqdmulh v29.8H, v30.8H, v7.H[1] // ..................*..................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v16.8H, v15.8H, v7.H[0] // ........................*............... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v12.8H, v31.8H, v7.H[0] // ...................*.................... + srshr v25.8H, v22.8H, #11 // ......................*................. + srshr v20.8H, v29.8H, #11 // ....................*................... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v19.8H, v25.8H, v7.H[0] // .......................*................ + mls v30.8H, v20.8H, v7.H[0] // .........................*.............. + add v3.8H, v12.8H, v16.8H // ...........................*............ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sub v2.8H, v12.8H, v16.8H // ............................*........... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q3, [x1], #(64) // ...................................*.... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + sub v21.8H, v30.8H, v19.8H // .............................*.......... + add v12.8H, v30.8H, v19.8H // ..........................*............. + mul v8.8H, v2.8H, v5.H[0] // ..................................*..... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + sqrdmulh v9.8H, v2.8H, v5.H[1] // .................................*...... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mul v31.8H, v21.8H, v5.H[0] // ...............................*........ + sqrdmulh v4.8H, v21.8H, v5.H[1] // ................................*....... + str q12, [x1, #-48] // ..............................*......... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v8.8H, v9.8H, v7.H[0] // .....................................*.. 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + mls v31.8H, v4.8H, v7.H[0] // ....................................*... + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q8, [x1, #-32] // .......................................* + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + str q31, [x1, #-16] // ......................................*. + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + // gap // ........................................ 
+ // gap // ........................................ + // gap // ........................................ + // gap // ........................................ + + // original source code + // trn1 v18.2D, v8.2D, v0.2D // .*...................................... + // trn2 v11.2D, v8.2D, v0.2D // *....................................... + // trn1 v10.2D, v23.2D, v24.2D // ...*.................................... + // trn2 v24.2D, v23.2D, v24.2D // ..*..................................... + // sub v19.8H, v11.8H, v24.8H // ....*................................... + // add v22.8H, v11.8H, v24.8H // ......*................................. + // sub v15.8H, v18.8H, v10.8H // .....*.................................. + // add v10.8H, v18.8H, v10.8H // ...........*............................ + // sqrdmulh v23.8H, v15.8H, v5.H[3] // .........*.............................. + // mul v6.8H, v15.8H, v5.H[2] // ..........*............................. + // sqrdmulh v9.8H, v19.8H, v5.H[5] // .......*................................ + // mul v24.8H, v19.8H, v5.H[4] // ........*............................... + // sqdmulh v0.8H, v10.8H, v7.H[1] // ...............*........................ + // mls v6.8H, v23.8H, v7.H[0] // ..............*......................... + // mls v24.8H, v9.8H, v7.H[0] // .............*.......................... + // srshr v16.8H, v0.8H, #11 // .................*...................... + // sqdmulh v0.8H, v22.8H, v7.H[1] // ............*........................... + // sqdmulh v29.8H, v24.8H, v7.H[1] // ..................*..................... + // sqdmulh v30.8H, v6.8H, v7.H[1] // ...................*.................... + // mls v10.8H, v16.8H, v7.H[0] // .....................*.................. + // srshr v23.8H, v30.8H, #11 // .......................*................ + // srshr v26.8H, v0.8H, #11 // ................*....................... + // srshr v0.8H, v29.8H, #11 // ......................*................. 
+ // mls v24.8H, v0.8H, v7.H[0] // ........................*............... + // mls v22.8H, v26.8H, v7.H[0] // ....................*................... + // mls v6.8H, v23.8H, v7.H[0] // .........................*.............. + // add v16.8H, v6.8H, v24.8H // ..............................*......... + // add v2.8H, v10.8H, v22.8H // ..........................*............. + // sub v10.8H, v10.8H, v22.8H // ...........................*............ + // sub v24.8H, v6.8H, v24.8H // .............................*.......... + // str q16, [x1, #16] // ...................................*.... + // mul v22.8H, v24.8H, v5.H[0] // .................................*...... + // sqrdmulh v11.8H, v24.8H, v5.H[1] // ..................................*..... + // sqrdmulh v0.8H, v10.8H, v5.H[1] // ................................*....... + // mul v10.8H, v10.8H, v5.H[0] // ...............................*........ + // str q2, [x1], #(64) // ............................*........... + // mls v22.8H, v11.8H, v7.H[0] // .....................................*.. + // mls v10.8H, v0.8H, v7.H[0] // ....................................*... + // str q22, [x1, #-16] // .......................................* + // str q10, [x1, #-32] // ......................................*. + + + // --------------------------------------------------------------------- + + // Setup between the two tool-generated sections: give names to the vector registers used for the ninv and ninv_tw constants, broadcast both constants, then set count and run the ASM_LOAD and load_roots_123 macros (both defined outside this chunk). + // NOTE(review): ninv and ninv_tw are presumably the n^-1 scaling constant and its precomputed twist for the inverse NTT; confirm against the constant tables. 
+ ninv .req v29 + ninv_tw .req v30 + + // Broadcast each constant. ASM_LOAD presumably places the address of the named symbol in xtmp (macro not visible here, confirm); ld1r then loads a single 16-bit value from [xtmp] and replicates it into all 8 halfword lanes of the destination. xtmp is reused for both constants. + ASM_LOAD(xtmp, ninv_addr) + ld1r {ninv.8h}, [xtmp] + ASM_LOAD(xtmp, ninv_tw_addr) + ld1r {ninv_tw.8h}, [xtmp] + + // count is set to 4; the earlier loop in this file decrements count and branches on it, so this is presumably the iteration count for the loop that follows (confirm). r_ptr0 presumably receives the address of roots_l012. + // NOTE(review): load_roots_123 is a macro defined outside this chunk; the code below uses lanes of v0 and v1 as multipliers, so it presumably fills those registers; confirm. + mov count, #4 + ASM_LOAD(r_ptr0, roots_l012) + load_roots_123 + + // Align the code that follows to a 4-byte boundary (2^2). + .p2align 2 + + // NOTE(review): the loads below appear to begin the tool-generated code for the next loop; they read q registers from x0 at offsets that are multiples of 64 bytes. The rest of that code lies outside this chunk. + ldr q23, [x0, #128] // ..*................................................. + ldr q19, [x0, #192] // .*.................................................. + ldr q22, [x0, #0] // *................................................... + // gap // .................................................... + // gap // ....................................................
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + ldr q28, [x0, #64] // ....*............................................... + ldr q27, [x0, #384] // ...*................................................ + ldr q24, [x0, #448] // .....*.............................................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + ldr q3, [x0, #320] // ......*............................................. + ldr q26, [x0, #256] // .......*............................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v20.8H, v23.8H, v19.8H // .........*.......................................... + add v19.8H, v23.8H, v19.8H // ........*........................................... 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v23.8H, v22.8H, v28.8H // ..........*......................................... + add v22.8H, v22.8H, v28.8H // ...........*........................................ + sub v28.8H, v27.8H, v24.8H // .............*...................................... + add v27.8H, v27.8H, v24.8H // ............*....................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v24.8H, v26.8H, v3.8H // ...............*.................................... + add v3.8H, v26.8H, v3.8H // .................*.................................. + mul v26.8H, v20.8H, v1.H[0] // ..............*..................................... + sqrdmulh v20.8H, v20.8H, v1.H[1] // ...................*................................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v11.8H, v23.8H, v0.H[7] // ....................*............................... + mul v23.8H, v23.8H, v0.H[6] // .....................*.............................. + sqrdmulh v14.8H, v28.8H, v1.H[5] // ................*................................... + mul v28.8H, v28.8H, v1.H[4] // ..................*................................. 
+ // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sqrdmulh v25.8H, v24.8H, v1.H[3] // ......................*............................. + mul v24.8H, v24.8H, v1.H[2] // .......................*............................ + sub v5.8H, v3.8H, v27.8H // ........................*........................... + sub v10.8H, v22.8H, v19.8H // .........................*.......................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + add v19.8H, v22.8H, v19.8H // ............................*....................... + add v22.8H, v3.8H, v27.8H // .............................*...................... + mls v26.8H, v20.8H, v7.H[0] // ...........................*........................ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mls v23.8H, v11.8H, v7.H[0] // ...............................*.................... + mls v28.8H, v14.8H, v7.H[0] // ..........................*......................... + sqrdmulh v3.8H, v5.8H, v0.H[5] // ................................*................... + sqrdmulh v20.8H, v10.8H, v0.H[3] // .................................*.................. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... 
+ // gap // .................................................... + mls v24.8H, v25.8H, v7.H[0] // ..............................*..................... + mul v13.8H, v5.8H, v0.H[4] // ...................................*................ + mul v16.8H, v10.8H, v0.H[2] // ..................................*................. + add v27.8H, v19.8H, v22.8H // ....................................*............... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v25.8H, v19.8H, v22.8H // ...........................................*........ + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v11.8H, v23.8H, v26.8H // ......................................*............. + add v15.8H, v23.8H, v26.8H // ........................................*........... + sqrdmulh v22.8H, v27.8H, v30.8H // .................................................*.. + mul v19.8H, v27.8H, v29.8H // ..................................................*. + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + sub v23.8H, v24.8H, v28.8H // .......................................*............ + add v27.8H, v24.8H, v28.8H // .....................................*.............. 
+ mls v13.8H, v3.8H, v7.H[0] // .........................................*.......... + mls v16.8H, v20.8H, v7.H[0] // ..........................................*......... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v4.8H, v11.8H, v0.H[2] // .............................................*...... + sqrdmulh v20.8H, v11.8H, v0.H[3] // ..............................................*..... + mul v5.8H, v25.8H, v0.H[0] // ................................................*... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + mul v10.8H, v23.8H, v0.H[4] // ............................................*....... + sqrdmulh v3.8H, v23.8H, v0.H[5] // ...............................................*.... + add v23.8H, v15.8H, v27.8H // ...................................................* + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + // gap // .................................................... + + // original source code + // ldr q21, [x0, #0] // ..*................................................. + // ldr q9, [x0, #192] // .*.................................................. + // ldr q16, [x0, #128] // *................................................... + // ldr q15, [x0, #384] // ....*............................................... 
+ // ldr q8, [x0, #64] // ...*................................................ + // ldr q25, [x0, #448] // .....*.............................................. + // ldr q18, [x0, #320] // ......*............................................. + // ldr q6, [x0, #256] // .......*............................................ + // add v13.8H, v16.8H, v9.8H // .........*.......................................... + // sub v16.8H, v16.8H, v9.8H // ........*........................................... + // sub v2.8H, v21.8H, v8.8H // ..........*......................................... + // add v17.8H, v21.8H, v8.8H // ...........*........................................ + // add v11.8H, v15.8H, v25.8H // .............*...................................... + // sub v27.8H, v15.8H, v25.8H // ............*....................................... + // mul v15.8H, v16.8H, v1.H[0] // ................*................................... + // sub v9.8H, v6.8H, v18.8H // ..............*..................................... + // sqrdmulh v28.8H, v27.8H, v1.H[5] // ....................*............................... + // add v18.8H, v6.8H, v18.8H // ...............*.................................... + // mul v6.8H, v27.8H, v1.H[4] // .....................*.............................. + // sqrdmulh v5.8H, v16.8H, v1.H[1] // .................*.................................. + // sqrdmulh v16.8H, v2.8H, v0.H[7] // ..................*................................. + // mul v23.8H, v2.8H, v0.H[6] // ...................*................................ + // sqrdmulh v27.8H, v9.8H, v1.H[3] // ......................*............................. + // mul v2.8H, v9.8H, v1.H[2] // .......................*............................ + // sub v25.8H, v18.8H, v11.8H // ........................*........................... + // sub v9.8H, v17.8H, v13.8H // .........................*.......................... 
+ // mls v6.8H, v28.8H, v7.H[0] // ..............................*..................... + // mls v15.8H, v5.8H, v7.H[0] // ............................*....................... + // add v5.8H, v17.8H, v13.8H // ..........................*......................... + // add v4.8H, v18.8H, v11.8H // ...........................*........................ + // mls v2.8H, v27.8H, v7.H[0] // .................................*.................. + // mls v23.8H, v16.8H, v7.H[0] // .............................*...................... + // sqrdmulh v11.8H, v25.8H, v0.H[5] // ...............................*.................... + // sqrdmulh v19.8H, v9.8H, v0.H[3] // ................................*................... + // mul v16.8H, v9.8H, v0.H[2] // ...................................*................ + // mul v13.8H, v25.8H, v0.H[4] // ..................................*................. + // add v24.8H, v5.8H, v4.8H // ....................................*............... + // add v27.8H, v2.8H, v6.8H // ...........................................*........ + // sub v14.8H, v23.8H, v15.8H // ......................................*............. + // sub v28.8H, v2.8H, v6.8H // ..........................................*......... + // add v15.8H, v23.8H, v15.8H // .......................................*............ + // mls v13.8H, v11.8H, v7.H[0] // ............................................*....... + // mls v16.8H, v19.8H, v7.H[0] // .............................................*...... + // sub v25.8H, v5.8H, v4.8H // .....................................*.............. + // mul v10.8H, v28.8H, v0.H[4] // .................................................*.. + // mul v4.8H, v14.8H, v0.H[2] // ..............................................*..... + // sqrdmulh v20.8H, v14.8H, v0.H[3] // ...............................................*.... + // sqrdmulh v3.8H, v28.8H, v0.H[5] // ..................................................*. 
+ // mul v5.8H, v25.8H, v0.H[0] // ................................................*... + // sqrdmulh v22.8H, v24.8H, v30.8H // ........................................*........... + // mul v19.8H, v24.8H, v29.8H // .........................................*.......... + // add v23.8H, v15.8H, v27.8H // ...................................................* + + sub count, count, #1 +layer123_start: + sub v26.8H, v16.8H, v13.8H // ..........................................................*............................. + // gap // ........................................................................................ + ldr q21, [x0, #16] // e....................................................................................... + ldr q9, [x0, #208] // ...e.................................................................................... + sqrdmulh v11.8H, v25.8H, v0.H[1] // ...................................................*.................................... + sub v27.8H, v15.8H, v27.8H // .....................................................*.................................. + add v28.8H, v16.8H, v13.8H // ...........................................................*............................ + ldr q16, [x0, #144] // ..e..................................................................................... + ldr q15, [x0, #400] // ......e................................................................................. + // gap // ........................................................................................ + ldr q8, [x0, #80] // .e...................................................................................... + ldr q25, [x0, #464] // .......e................................................................................ + mls v4.8H, v20.8H, v7.H[0] // .....................................*.................................................. 
+ mls v10.8H, v3.8H, v7.H[0] // ...............................................*........................................ + mul v31.8H, v23.8H, v29.8H // ...........................................................................*............ + sqrdmulh v17.8H, v23.8H, v30.8H // ............................................................................*........... + sqrdmulh v14.8H, v26.8H, v0.H[1] // .............................................................*.......................... + mul v12.8H, v26.8H, v0.H[0] // ............................................................*........................... + mul v26.8H, v27.8H, v0.H[0] // .......................................................*................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + ldr q18, [x0, #336] // .....e.................................................................................. + ldr q6, [x0, #272] // ....e................................................................................... + sqrdmulh v20.8H, v27.8H, v0.H[1] // ........................................................*............................... + // gap // ........................................................................................ + mls v19.8H, v22.8H, v7.H[0] // ..........................................................................*............. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v3.8H, v28.8H, v29.8H // ..............................................................................*......... 
+ sqrdmulh v24.8H, v28.8H, v30.8H // ...............................................................................*........ + mls v5.8H, v11.8H, v7.H[0] // ....................................................*................................... + add v13.8H, v16.8H, v9.8H // ..............e......................................................................... + mls v31.8H, v17.8H, v7.H[0] // .............................................................................*.......... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v16.8H, v16.8H, v9.8H // .............e.......................................................................... + add v23.8H, v4.8H, v10.8H // ................................................................*....................... + sub v2.8H, v21.8H, v8.8H // ........e............................................................................... + add v17.8H, v21.8H, v8.8H // .........e.............................................................................. + add v11.8H, v15.8H, v25.8H // ........................e............................................................... + // gap // ........................................................................................ + sub v27.8H, v15.8H, v25.8H // .......................e................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v15.8H, v16.8H, v1.H[0] // ...............e........................................................................ + sub v9.8H, v6.8H, v18.8H // ..................e..................................................................... + str q19, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + // gap // ........................................................................................ + str q5, [x0, #240] // ....................................................................*................... + mul v22.8H, v23.8H, v29.8H // .................................................................................*...... + sqrdmulh v25.8H, v23.8H, v30.8H // ..................................................................................*..... + sqrdmulh v28.8H, v27.8H, v1.H[5] // ..........................e............................................................. + add v18.8H, v6.8H, v18.8H // ...................e.................................................................... + mul v6.8H, v27.8H, v1.H[4] // .........................e.............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v5.8H, v16.8H, v1.H[1] // ................e....................................................................... 
+ sqrdmulh v16.8H, v2.8H, v0.H[7] // ...........e............................................................................ + mul v23.8H, v2.8H, v0.H[6] // ..........e............................................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v27.8H, v9.8H, v1.H[3] // .....................e.................................................................. + mul v2.8H, v9.8H, v1.H[2] // ....................e................................................................... + mls v22.8H, v25.8H, v7.H[0] // ...................................................................................*.... + sub v25.8H, v18.8H, v11.8H // ......................................e................................................. + sub v9.8H, v17.8H, v13.8H // ............................e........................................................... + sub v19.8H, v4.8H, v10.8H // ...............................................................*........................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v6.8H, v28.8H, v7.H[0] // ...........................e............................................................ 
+ mls v15.8H, v5.8H, v7.H[0] // .................e...................................................................... + add v5.8H, v17.8H, v13.8H // .............................e.......................................................... + add v4.8H, v18.8H, v11.8H // .......................................e................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v28.8H, v19.8H, v0.H[1] // ..................................................................*..................... + mul v18.8H, v19.8H, v0.H[0] // .................................................................*...................... + mls v2.8H, v27.8H, v7.H[0] // ......................e................................................................. + mls v23.8H, v16.8H, v7.H[0] // ............e........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v11.8H, v25.8H, v0.H[5] // .........................................e.............................................. 
+ sqrdmulh v19.8H, v9.8H, v0.H[3] // ...............................e........................................................ + mul v16.8H, v9.8H, v0.H[2] // ..............................e......................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v13.8H, v25.8H, v0.H[4] // ........................................e............................................... + mls v3.8H, v24.8H, v7.H[0] // ................................................................................*....... + mls v12.8H, v14.8H, v7.H[0] // ..............................................................*......................... + mls v26.8H, v20.8H, v7.H[0] // .........................................................*.............................. + add v24.8H, v5.8H, v4.8H // .................................................e...................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + add v27.8H, v2.8H, v6.8H // ............................................e........................................... + mls v18.8H, v28.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + str q22, [x0, #176] // .......................................................................................* + sub v14.8H, v23.8H, v15.8H // .................................e...................................................... + sub v28.8H, v2.8H, v6.8H // ...........................................e............................................ + add v15.8H, v23.8H, v15.8H // ..................................e..................................................... + mls v13.8H, v11.8H, v7.H[0] // ..........................................e............................................. + mls v16.8H, v19.8H, v7.H[0] // ................................e....................................................... + sub v25.8H, v5.8H, v4.8H // ................................................e....................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + str q31, [x0, #48] // .....................................................................................*.. + // gap // ........................................................................................ + str q3, [x0, #112] // ......................................................................................*. + str q12, [x0, #368] // ......................................................................*................. + mul v10.8H, v28.8H, v0.H[4] // .............................................e.......................................... 
+ // gap // ........................................................................................ + mul v4.8H, v14.8H, v0.H[2] // ...................................e.................................................... + sqrdmulh v20.8H, v14.8H, v0.H[3] // ....................................e................................................... + sqrdmulh v3.8H, v28.8H, v0.H[5] // ..............................................e......................................... + str q18, [x0, #432] // .......................................................................*................ + mul v5.8H, v25.8H, v0.H[0] // ..................................................e..................................... + sqrdmulh v22.8H, v24.8H, v30.8H // .........................................................................e.............. + mul v19.8H, v24.8H, v29.8H // ........................................................................e............... + str q26, [x0, #304] // .....................................................................*.................. + // gap // ........................................................................................ + // gap // ........................................................................................ + add v23.8H, v15.8H, v27.8H // ......................................................e................................. + + // original source code + // ldr q8, [x0, #0] // e......................................................................................|e..................................................................................... + // ldr q9, [x0, #(1*(512/8))] // .......e...............................................................................|.......e.............................................................................. 
+ // ldr q10, [x0, #(2*(512/8))] // .....e.................................................................................|.....e................................................................................ + // ldr q11, [x0, #(3*(512/8))] // .e.....................................................................................|.e.................................................................................... + // ldr q12, [x0, #(4*(512/8))] // .................e.....................................................................|.................e.................................................................... + // ldr q13, [x0, #(5*(512/8))] // ................e......................................................................|................e..................................................................... + // ldr q14, [x0, #(6*(512/8))] // ......e................................................................................|......e............................................................................... + // ldr q15, [x0, #(7*(512/8))] // ........e..............................................................................|........e............................................................................. + // sub v24.8h, v8.8h, v9.8h // ...........................e...........................................................|...........................e.......................................................... + // add v8.8h, v8.8h, v9.8h // ............................e..........................................................|............................e......................................................... + // mul v9.8h, v24.8h, v0.h[6] // ..........................................e............................................|..........................................e........................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[7] // .........................................e.............................................|.........................................e............................................ + // mls v9.8h, v24.8h, v7.h[0] // ........................................................e..............................|........................................................e............................. + // sub v24.8h, v10.8h, v11.8h // .........................e.............................................................|.........................e............................................................ + // add v10.8h, v10.8h, v11.8h // .......................e...............................................................|.......................e.............................................................. + // mul v11.8h, v24.8h, v1.h[0] // ...............................e.......................................................|...............................e...................................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ........................................e..............................................|........................................e............................................. + // mls v11.8h, v24.8h, v7.h[0] // ..................................................e....................................|..................................................e................................... + // sub v24.8h, v12.8h, v13.8h // ................................e......................................................|................................e..................................................... + // add v12.8h, v12.8h, v13.8h // ......................................e................................................|......................................e............................................... 
+ // mul v13.8h, v24.8h, v1.h[2] // ............................................e..........................................|............................................e......................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ...........................................e...........................................|...........................................e.......................................... + // mls v13.8h, v24.8h, v7.h[0] // .......................................................e...............................|.......................................................e.............................. + // sub v24.8h, v14.8h, v15.8h // ..............................e........................................................|..............................e....................................................... + // add v14.8h, v14.8h, v15.8h // .............................e.........................................................|.............................e........................................................ + // mul v15.8h, v24.8h, v1.h[4] // .......................................e...............................................|.......................................e.............................................. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // .....................................e.................................................|.....................................e................................................ + // mls v15.8h, v24.8h, v7.h[0] // .................................................e.....................................|.................................................e.................................... + // sub v24.8h, v8.8h, v10.8h // ...............................................e.......................................|...............................................e...................................... 
+ // add v8.8h, v8.8h, v10.8h // ...................................................e...................................|...................................................e.................................. + // mul v10.8h, v24.8h, v0.h[2] // ...........................................................e...........................|...........................................................e.......................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ..........................................................e............................|..........................................................e........................... + // mls v10.8h, v24.8h, v7.h[0] // ........................................................................e..............|........................................................................e............. + // sub v24.8h, v9.8h, v11.8h // ....................................................................e..................|....................................................................e................. + // add v9.8h, v9.8h, v11.8h // ......................................................................e................|......................................................................e............... + // mul v11.8h, v24.8h, v0.h[2] // ..............................................................................e........|..............................................................................e....... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ...............................................................................e.......|...............................................................................e...... + // mls v11.8h, v24.8h, v7.h[0] // .........*.............................................................................|.........*............................................................................ 
+ // sub v24.8h, v12.8h, v14.8h // ..............................................e........................................|..............................................e....................................... + // add v12.8h, v12.8h, v14.8h // ....................................................e..................................|....................................................e................................. + // mul v14.8h, v24.8h, v0.h[4] // ............................................................e..........................|............................................................e......................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .........................................................e.............................|.........................................................e............................ + // mls v14.8h, v24.8h, v7.h[0] // .......................................................................e...............|.......................................................................e.............. + // sub v24.8h, v13.8h, v15.8h // .....................................................................e.................|.....................................................................e................ + // add v13.8h, v13.8h, v15.8h // .................................................................e.....................|.................................................................e.................... + // mul v15.8h, v24.8h, v0.h[4] // .............................................................................e.........|.............................................................................e........ + // sqrdmulh v24.8h, v24.8h, v0.h[5] // ................................................................................e......|................................................................................e..... 
+ // mls v15.8h, v24.8h, v7.h[0] // ..........*............................................................................|..........*........................................................................... + // sub v24.8h, v8.8h, v12.8h // .........................................................................e.............|.........................................................................e............ + // add v8.8h, v8.8h, v12.8h // ................................................................e......................|................................................................e..................... + // mul v12.8h, v24.8h, v0.h[0] // ..................................................................................e....|..................................................................................e... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..*....................................................................................|..*................................................................................... + // mls v12.8h, v24.8h, v7.h[0] // ......................*................................................................|......................*............................................................... + // sub v24.8h, v9.8h, v13.8h // ...*...................................................................................|...*.................................................................................. + // add v9.8h, v9.8h, v13.8h // ......................................................................................e|...................................................................................... + // mul v13.8h, v24.8h, v0.h[0] // ...............*.......................................................................|...............*...................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................*....................................................................|..................*................................................................... + // mls v13.8h, v24.8h, v7.h[0] // ...............................................................*.......................|...............................................................*...................... + // sub v24.8h, v10.8h, v14.8h // .......................................................................................*...................................................................................... + // add v10.8h, v10.8h, v14.8h // ....*..................................................................................|....*................................................................................. + // mul v14.8h, v24.8h, v0.h[0] // ..............*........................................................................|..............*....................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .............*.........................................................................|.............*........................................................................ + // mls v14.8h, v24.8h, v7.h[0] // ..............................................................*........................|..............................................................*....................... + // sub v24.8h, v11.8h, v15.8h // ................................................*......................................|................................................*..................................... + // add v11.8h, v11.8h, v15.8h // ..........................*............................................................|..........................*........................................................... 
+ // mul v15.8h, v24.8h, v0.h[0] // ......................................................*................................|......................................................*............................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .....................................................*.................................|.....................................................*................................ + // mls v15.8h, v24.8h, v7.h[0] // ..................................................................*....................|..................................................................*................... + // str q12, [x0, #(4*(512/8))] // ..................................*....................................................|..................................*................................................... + // str q13, [x0, #(5*(512/8))] // .....................................................................................*.|.....................................................................................* + // str q14, [x0, #(6*(512/8))] // ............................................................................*..........|............................................................................*......... + // str q15, [x0, #(7*(512/8))] // .................................................................................*.....|.................................................................................*.... + // mul v12.8h, v8.8h, v29.8h // ....................................................................................e..|....................................................................................e. + // sqrdmulh v8.8h, v8.8h, v30.8h // ...................................................................................e...|...................................................................................e.. 
+ // mls v12.8h, v8.8h, v7.h[0] // ...................*...................................................................|...................*.................................................................. + // mul v13.8h, v9.8h, v29.8h // ...........*...........................................................................|...........*.......................................................................... + // sqrdmulh v9.8h, v9.8h, v30.8h // ............*..........................................................................|............*......................................................................... + // mls v13.8h, v9.8h, v7.h[0] // ........................*..............................................................|........................*............................................................. + // mul v14.8h, v10.8h, v29.8h // ....................*..................................................................|....................*................................................................. + // sqrdmulh v10.8h, v10.8h, v30.8h // .....................*.................................................................|.....................*................................................................ + // mls v14.8h, v10.8h, v7.h[0] // .............................................................*.........................|.............................................................*........................ + // mul v15.8h, v11.8h, v29.8h // ...................................*...................................................|...................................*.................................................. + // sqrdmulh v11.8h, v11.8h, v30.8h // ....................................*..................................................|....................................*................................................. 
+ // mls v15.8h, v11.8h, v7.h[0] // .............................................*.........................................|.............................................*........................................ + // str q12, [x0], #(16) // .................................*.....................................................|.................................*.................................................... + // str q13, [x0, #(-16 + 1*(512/8))] // ..........................................................................*............|..........................................................................*........... + // str q14, [x0, #(-16 + 2*(512/8))] // ...........................................................................*...........|...........................................................................*.......... + // str q15, [x0, #(-16 + 3*(512/8))] // ...................................................................*...................|...................................................................*.................. + + sub count, count, #1 + cbnz count, layer123_start + mls v19.8H, v22.8H, v7.H[0] // ............*....................... + sub v24.8H, v15.8H, v27.8H // ..*................................. + mls v10.8H, v3.8H, v7.H[0] // .....*.............................. + mls v4.8H, v20.8H, v7.H[0] // ....*............................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + add v17.8H, v16.8H, v13.8H // ...*................................ + sqrdmulh v6.8H, v23.8H, v30.8H // .......*............................ + sub v21.8H, v16.8H, v13.8H // *................................... + mul v26.8H, v23.8H, v29.8H // ......*............................. + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + sqrdmulh v16.8H, v25.8H, v0.H[1] // .*.................................. + mul v2.8H, v24.8H, v0.H[0] // ..........*......................... + sqrdmulh v20.8H, v24.8H, v0.H[1] // ...........*........................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q19, [x0], #(16) // ..................*................. + sqrdmulh v15.8H, v21.8H, v0.H[1] // ........*........................... + mul v31.8H, v21.8H, v0.H[0] // .........*.......................... + add v13.8H, v4.8H, v10.8H // .................*.................. + sub v4.8H, v4.8H, v10.8H // .......................*............ + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v26.8H, v6.8H, v7.H[0] // ................*................... + mul v27.8H, v17.8H, v29.8H // .............*...................... + sqrdmulh v17.8H, v17.8H, v30.8H // ..............*..................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mul v24.8H, v13.8H, v29.8H // ....................*............... + sqrdmulh v3.8H, v13.8H, v30.8H // .....................*.............. + sqrdmulh v25.8H, v4.8H, v0.H[1] // ........................*........... + mul v12.8H, v4.8H, v0.H[0] // .........................*.......... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ mls v31.8H, v15.8H, v7.H[0] // ...........................*........ + mls v5.8H, v16.8H, v7.H[0] // ...............*.................... + mls v2.8H, v20.8H, v7.H[0] // ............................*....... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v27.8H, v17.8H, v7.H[0] // ..........................*......... + str q26, [x0, #48] // ...............................*.... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + mls v24.8H, v3.8H, v7.H[0] // ......................*............. + mls v12.8H, v25.8H, v7.H[0] // .............................*...... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q31, [x0, #368] // .................................*.. + str q5, [x0, #240] // ...................*................ + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + str q27, [x0, #112] // ................................*... + str q2, [x0, #304] // ...................................* + // gap // .................................... + // gap // .................................... + // gap // .................................... 
+ // gap // .................................... + // gap // .................................... + // gap // .................................... + str q24, [x0, #176] // ..............................*..... + str q12, [x0, #432] // ..................................*. + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + // gap // .................................... + + // original source code + // sub v26.8H, v16.8H, v13.8H // ......*............................. + // sqrdmulh v11.8H, v25.8H, v0.H[1] // ........*........................... + // sub v27.8H, v15.8H, v27.8H // .*.................................. + // add v28.8H, v16.8H, v13.8H // ....*............................... + // mls v4.8H, v20.8H, v7.H[0] // ...*................................ + // mls v10.8H, v3.8H, v7.H[0] // ..*................................. + // mul v31.8H, v23.8H, v29.8H // .......*............................ + // sqrdmulh v17.8H, v23.8H, v30.8H // .....*.............................. + // sqrdmulh v14.8H, v26.8H, v0.H[1] // ............*....................... + // mul v12.8H, v26.8H, v0.H[0] // .............*...................... + // mul v26.8H, v27.8H, v0.H[0] // .........*.......................... + // sqrdmulh v20.8H, v27.8H, v0.H[1] // ..........*......................... + // mls v19.8H, v22.8H, v7.H[0] // *................................... + // mul v3.8H, v28.8H, v29.8H // .................*.................. + // sqrdmulh v24.8H, v28.8H, v30.8H // ..................*................. + // mls v5.8H, v11.8H, v7.H[0] // ........................*........... + // mls v31.8H, v17.8H, v7.H[0] // ................*................... + // add v23.8H, v4.8H, v10.8H // ..............*..................... + // str q19, [x0], #(16) // ...........*........................ 
+ // str q5, [x0, #240] // ...............................*.... + // mul v22.8H, v23.8H, v29.8H // ...................*................ + // sqrdmulh v25.8H, v23.8H, v30.8H // ....................*............... + // mls v22.8H, v25.8H, v7.H[0] // ............................*....... + // sub v19.8H, v4.8H, v10.8H // ...............*.................... + // sqrdmulh v28.8H, v19.8H, v0.H[1] // .....................*.............. + // mul v18.8H, v19.8H, v0.H[0] // ......................*............. + // mls v3.8H, v24.8H, v7.H[0] // ..........................*......... + // mls v12.8H, v14.8H, v7.H[0] // .......................*............ + // mls v26.8H, v20.8H, v7.H[0] // .........................*.......... + // mls v18.8H, v28.8H, v7.H[0] // .............................*...... + // str q22, [x0, #176] // ..................................*. + // str q31, [x0, #48] // ...........................*........ + // str q3, [x0, #112] // ................................*... + // str q12, [x0, #368] // ..............................*..... + // str q18, [x0, #432] // ...................................* + // str q26, [x0, #304] // .................................*.. 
+ + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s new file mode 100644 index 00000000..bbadddf1 --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_4567_opt_m1_icestorm.s @@ -0,0 +1,1440 @@ +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +// Needed to provide ASM_LOAD directive +#include <hal_env.h> + +// NOTE +// We use a lot of trivial macros to simplify the parsing burden for Slothy +// The macros are not unfolded by Slothy and thus interpreted as instructions, +// which are easier to parse due to e.g. the lack of size specifiers and simpler +// syntax for pre and post increment for loads and stores. 
+ +// Eventually, NeLight should include a proper parser for AArch64, +// but for initial investigations, the below is enough. + +.macro ldr_vo vec, base, offset + ldr qform_\vec, [\base, #\offset] // load the q-form alias of vec from base at an immediate offset; base is left unchanged +.endm + +.macro ldr_vi vec, base, inc + ldr qform_\vec, [\base], #\inc // post-indexed load: read at base, then advance base by inc +.endm + +.macro str_vo vec, base, offset + str qform_\vec, [\base, #\offset] // store the q-form alias of vec to base at an immediate offset; base is left unchanged +.endm +.macro str_vi vec, base, inc + str qform_\vec, [\base], #\inc // post-indexed store: write at base, then advance base by inc +.endm + +.macro vqrdmulh d,a,b + sqrdmulh \d\().8h, \a\().8h, \b\().8h // vector-by-vector sqrdmulh on eight halfword lanes +.endm +.macro vmlsq d,a,b,i + mls \d\().8h, \a\().8h, \b\().h[\i] // d -= a * b.h[i] in every halfword lane +.endm +.macro vqrdmulhq d,a,b,i + sqrdmulh \d\().8h, \a\().8h, \b\().h[\i] // by-element sqrdmulh: every lane of a against b.h[i] +.endm +.macro vqdmulhq d,a,b,i + sqdmulh \d\().8h, \a\().8h, \b\().h[\i] // by-element sqdmulh (non-rounding counterpart of vqrdmulhq) +.endm +.macro vmulq d,a,b,i + mul \d\().8h, \a\().8h, \b\().h[\i] // d = a * b.h[i] in every halfword lane +.endm + +.macro mulmodq dst, src, const, idx0, idx1 + vmulq \dst, \src, \const, \idx0 // dst = src * const.h[idx0] + vqrdmulhq \src, \src, \const, \idx1 // src = sqrdmulh(src, const.h[idx1]); src is overwritten + vmlsq \dst, \src, consts, 0 // dst -= src * consts.h[0] (presumably 3329, the first entry of const_addr; confirm) +.endm + +.macro mulmod dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h // dst = src * const, lane by lane (full-vector constant instead of a single lane) + vqrdmulh \src, \src, \const_twisted // src = sqrdmulh(src, const_twisted); src is overwritten + vmlsq \dst, \src, consts, 0 // dst -= src * consts.h[0] +.endm + +.macro gs_butterfly a, b, root, idx0, idx1 + sub tmp.8h, \a\().8h, \b\().8h // tmp = a - b + add \a\().8h, \a\().8h, \b\().8h // a = a + b + mulmodq \b, tmp, \root, \idx0, \idx1 // b = mulmodq of tmp with root lanes idx0 and idx1; tmp is clobbered +.endm + +.macro mulmod_v dst, src, const, const_twisted + mul \dst\().8h, \src\().8h, \const\().8h // this macro has the same instruction sequence as mulmod + vqrdmulh \src, \src, \const_twisted + vmlsq \dst, \src, consts, 0 +.endm + +.macro gs_butterfly_v a, b, root, root_twisted + sub tmp.8h, \a\().8h, \b\().8h // tmp = a - b + add \a\().8h, \a\().8h, \b\().8h // a = a + b + mulmod \b, tmp, \root, \root_twisted // expands mulmod rather than mulmod_v; the two bodies are identical +.endm + +.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 + mulmod \dst0, \src0, ninv, ninv_tw // each dstN = mulmod(srcN, ninv, ninv_tw); mulmod overwrites srcN + mulmod \dst1, \src1, ninv, ninv_tw + mulmod \dst2, \src2, ninv, ninv_tw + mulmod \dst3, \src3, ninv, ninv_tw +.endm + +.macro barrett_reduce a + vqdmulhq t0, \a, consts, 1 // t0 = sqdmulh(a, consts.h[1]) + srshr t0.8h, t0.8h, #11 // rounding shift right by 11 + vmlsq \a, t0, consts, 0 // a -= t0 * consts.h[0]; t0 is clobbered +.endm + +.macro load_roots_123 + ldr_vi root0, r_ptr0, 32 + ldr_vo root1, 
r_ptr0, -16 // offset is relative to r_ptr0 after the preceding post-increment by 32 +.endm + +.macro load_next_roots_45 + ldr_vi root0, r_ptr0, 16 // load root0, then advance r_ptr0 by 16 +.endm + +.macro load_next_roots_67 + ldr_vi root0, r_ptr1, (6*16) // load root0, then advance r_ptr1 by 6*16; the loads below compensate with a -6*16 bias + ldr_vo root0_tw, r_ptr1, (-6*16 + 1*16) + ldr_vo root1, r_ptr1, (-6*16 + 2*16) + ldr_vo root1_tw, r_ptr1, (-6*16 + 3*16) + ldr_vo root2, r_ptr1, (-6*16 + 4*16) + ldr_vo root2_tw, r_ptr1, (-6*16 + 5*16) +.endm + +.macro transpose4 data + trn1 t0.4s, \data\()0.4s, \data\()1.4s // stage 1: interleave 32-bit lanes of registers 0/1 and 2/3 into t0-t3 + trn2 t1.4s, \data\()0.4s, \data\()1.4s + trn1 t2.4s, \data\()2.4s, \data\()3.4s + trn2 t3.4s, \data\()2.4s, \data\()3.4s + + trn2 \data\()2.2d, t0.2d, t2.2d // stage 2: recombine 64-bit halves; net effect is a 4x4 transpose of 32-bit elements across the four registers + trn2 \data\()3.2d, t1.2d, t3.2d + trn1 \data\()0.2d, t0.2d, t2.2d + trn1 \data\()1.2d, t1.2d, t3.2d +.endm + +.macro transpose_single data_out, data_in + trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s // same 32-bit trn1/trn2 stage as in transpose4, written to data_out 0-3 instead of t0-t3 + trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s + trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s + trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s +.endm + +.macro save_gprs // slothy:no-unfold + sub sp, sp, #(16*6) // reserve 96 bytes of stack for x19-x29 + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] // NOTE(review): exact duplicate of the previous store (same registers, same slot); redundant + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + str x29, [sp, #16*5] +.endm + +.macro restore_gprs // slothy:no-unfold + ldp x19, x20, [sp, #16*0] // reload x19-x29 from the slots written by save_gprs + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldr x29, [sp, #16*5] + add sp, sp, #(16*6) // release the 96 bytes reserved by save_gprs +.endm + +.macro save_vregs // slothy:no-unfold + sub sp, sp, #(16*4) // reserve 64 bytes of stack for d8-d15 + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs // slothy:no-unfold + ldp d8, d9, [sp, #16*0] // reload d8-d15 from the slots written by save_vregs + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) // release the 64 bytes reserved by save_vregs +.endm + +#define STACK_SIZE 16 +#define STACK0 0 + +.macro restore a, loc // slothy:no-unfold + ldr \a, [sp, #\loc\()] // reload a from stack offset loc; note the argument order is (a, loc) here but (loc, a) in save +.endm +.macro save loc, a // slothy:no-unfold + str \a, 
[sp, #\loc\()] +.endm +.macro push_stack // slothy:no-unfold + save_gprs + save_vregs + sub sp, sp, #STACK_SIZE +.endm + +.macro pop_stack // slothy:no-unfold + add sp, sp, #STACK_SIZE + restore_vregs + restore_gprs +.endm + +.data +.p2align 4 +roots: +#include "intt_kyber_123_45_67_twiddles.s" +.text + + .global intt_kyber_123_4567_opt_m1_icestorm + .global _intt_kyber_123_4567_opt_m1_icestorm + +.p2align 4 +const_addr: .short 3329 + .short 20159 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 +ninv_addr: .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 + .short 512 +ninv_tw_addr: .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + .short 5040 + +intt_kyber_123_4567_opt_m1_icestorm: +_intt_kyber_123_4567_opt_m1_icestorm: + push_stack + + in .req x0 + inp .req x1 + count .req x2 + r_ptr0 .req x3 + r_ptr1 .req x4 + xtmp .req x5 + + qform_v0 .req q0 + qform_v1 .req q1 + qform_v2 .req q2 + qform_v3 .req q3 + qform_v4 .req q4 + qform_v5 .req q5 + qform_v6 .req q6 + qform_v7 .req q7 + qform_v8 .req q8 + qform_v9 .req q9 + qform_v10 .req q10 + qform_v11 .req q11 + qform_v12 .req q12 + qform_v13 .req q13 + qform_v14 .req q14 + qform_v15 .req q15 + qform_v16 .req q16 + qform_v17 .req q17 + qform_v18 .req q18 + qform_v19 .req q19 + qform_v20 .req q20 + qform_v21 .req q21 + qform_v22 .req q22 + qform_v23 .req q23 + qform_v24 .req q24 + qform_v25 .req q25 + qform_v26 .req q26 + qform_v27 .req q27 + qform_v28 .req q28 + qform_v29 .req q29 + qform_v30 .req q30 + qform_v31 .req q31 + + data0 .req v8 + data1 .req v9 + data2 .req v10 + data3 .req v11 + data4 .req v12 + data5 .req v13 + data6 .req v14 + data7 .req v15 + + x_00 .req x10 + x_01 .req x11 + x_10 .req x12 + x_11 .req x13 + x_20 .req x14 + x_21 .req x15 + x_30 .req x16 + x_31 .req x17 + + xt_00 .req x_00 + xt_01 .req x_20 + xt_10 .req x_10 + xt_11 .req x_30 + xt_20 .req x_01 + xt_21 .req x_21 + xt_30 .req x_11 + xt_31 
.req x_31 + + qform_data0 .req q8 + qform_data1 .req q9 + qform_data2 .req q10 + qform_data3 .req q11 + qform_data4 .req q12 + qform_data5 .req q13 + qform_data6 .req q14 + qform_data7 .req q15 + + root0 .req v0 + root1 .req v1 + root2 .req v2 + root0_tw .req v4 + root1_tw .req v5 + root2_tw .req v6 + + consts .req v7 + qform_consts .req q7 + + qform_root0 .req q0 + qform_root1 .req q1 + qform_root2 .req q2 + qform_root0_tw .req q4 + qform_root1_tw .req q5 + qform_root2_tw .req q6 + + tmp .req v24 + t0 .req v25 + t1 .req v26 + t2 .req v27 + t3 .req v28 + + ASM_LOAD(r_ptr0, roots_l34) + ASM_LOAD(r_ptr1, roots_l56) + + ASM_LOAD(xtmp, const_addr) + ld1 {consts.8h}, [xtmp] + + save STACK0, in + + mov inp, in + mov count, #8 + + .p2align 2 + ldr q19, [x1, #16] // ...*............................................. + ldr q1, [x1, #0] // ..*.............................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + ldr q0, [x1, #32] // .*............................................... + // gap // ................................................. + ldr q22, [x1, #48] // *................................................ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + ldr q6, [x3], #16 // .......................................*......... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + ldr q11, [x4, #48] // ........*........................................ + trn1 v27.4S, v1.4S, v19.4S // ...........*..................................... 
+ trn2 v24.4S, v1.4S, v19.4S // .........*....................................... + // gap // ................................................. + trn2 v17.4S, v0.4S, v22.4S // .......*......................................... + // gap // ................................................. + ldr q19, [x4, #80] // .................*............................... + trn1 v8.4S, v0.4S, v22.4S // ......*.......................................... + // gap // ................................................. + // gap // ................................................. + ldr q0, [x4, #32] // .....*........................................... + // gap // ................................................. + trn1 v4.2D, v27.2D, v8.2D // ...............*................................. + // gap // ................................................. + // gap // ................................................. + trn1 v13.2D, v24.2D, v17.2D // ............*.................................... + ldr q2, [x4, #64] // ....*............................................ + trn2 v22.2D, v27.2D, v8.2D // ................*................................ + // gap // ................................................. + trn2 v12.2D, v24.2D, v17.2D // .............*................................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sub v5.8H, v4.8H, v13.8H // ..................*.............................. + add v28.8H, v4.8H, v13.8H // ....................*............................ + // gap // ................................................. + // gap // ................................................. + sub v17.8H, v22.8H, v12.8H // ...................*............................. + // gap // ................................................. + mul v23.8H, v5.8H, v0.8H // ......................*.......................... 
+ // gap // ................................................. + sqrdmulh v10.8H, v5.8H, v11.8H // ........................*........................ + mul v30.8H, v17.8H, v2.8H // .......................*......................... + // gap // ................................................. + // gap // ................................................. + sqrdmulh v19.8H, v17.8H, v19.8H // .....................*........................... + add v16.8H, v22.8H, v12.8H // .........................*....................... + ldr q2, [x4, #16] // ..........*...................................... + // gap // ................................................. + // gap // ................................................. + mls v23.8H, v10.8H, v7.H[0] // ..........................*...................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mls v30.8H, v19.8H, v7.H[0] // ...........................*..................... + sub v11.8H, v28.8H, v16.8H // ............................*.................... + // gap // ................................................. + ldr q19, [x4], #(6*16) // ..............*.................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + sqrdmulh v22.8H, v11.8H, v2.8H // .............................*................... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ sub v8.8H, v23.8H, v30.8H // ..............................*.................. + mul v21.8H, v11.8H, v19.8H // ...............................*................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + mul v3.8H, v8.8H, v19.8H // .................................*............... + sqrdmulh v13.8H, v8.8H, v2.8H // ................................*................ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + add v8.8H, v23.8H, v30.8H // ...................................*............. + add v19.8H, v28.8H, v16.8H // ..................................*.............. + // gap // ................................................. + // gap // ................................................. + mls v21.8H, v22.8H, v7.H[0] // .....................................*........... + // gap // ................................................. + // gap // ................................................. + mls v3.8H, v13.8H, v7.H[0] // ....................................*............ + trn1 v5.4S, v19.4S, v8.4S // ......................................*.......... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v8.4S, v19.4S, v8.4S // ........................................*........ + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. 
+ trn1 v13.4S, v21.4S, v3.4S // ..........................................*...... + // gap // ................................................. + // gap // ................................................. + trn2 v30.4S, v21.4S, v3.4S // .........................................*....... + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + // gap // ................................................. + trn2 v14.2D, v8.2D, v30.2D // ............................................*.... + // gap // ................................................. + // gap // ................................................. + trn2 v19.2D, v5.2D, v13.2D // ...........................................*..... + trn1 v22.2D, v8.2D, v30.2D // .............................................*... + trn1 v28.2D, v5.2D, v13.2D // ..............................................*.. + // gap // ................................................. + // gap // ................................................. + add v3.8H, v19.8H, v14.8H // ...............................................*. + // gap // ................................................. + // gap // ................................................. + sub v19.8H, v19.8H, v14.8H // ................................................* + + // original source code + // ldr q30, [x1, #48] // ...*............................................. + // ldr q12, [x1, #32] // ..*.............................................. + // ldr q26, [x1, #0] // .*............................................... + // ldr q21, [x1, #16] // *................................................ + // ldr q29, [x4, #64] // ..............*.................................. + // ldr q2, [x4, #32] // ...........*..................................... + // trn1 v13.4S, v12.4S, v30.4S // ..........*...................................... 
+ // trn2 v19.4S, v12.4S, v30.4S // ........*........................................ + // ldr q14, [x4, #48] // .....*........................................... + // trn2 v9.4S, v26.4S, v21.4S // .......*......................................... + // ldr q31, [x4, #16] // .........................*....................... + // trn1 v11.4S, v26.4S, v21.4S // ......*.......................................... + // trn1 v5.2D, v9.2D, v19.2D // .............*................................... + // trn2 v8.2D, v9.2D, v19.2D // ................*................................ + // ldr q9, [x4], #(6*16) // .............................*................... + // trn1 v12.2D, v11.2D, v13.2D // ............*.................................... + // trn2 v11.2D, v11.2D, v13.2D // ...............*................................. + // ldr q15, [x4, #-16] // .........*....................................... + // sub v19.8H, v12.8H, v5.8H // .................*............................... + // sub v26.8H, v11.8H, v8.8H // ...................*............................. + // add v5.8H, v12.8H, v5.8H // ..................*.............................. + // sqrdmulh v0.8H, v26.8H, v15.8H // .......................*......................... + // mul v10.8H, v19.8H, v2.8H // ....................*............................ + // mul v29.8H, v26.8H, v29.8H // ......................*.......................... + // sqrdmulh v14.8H, v19.8H, v14.8H // .....................*........................... + // add v11.8H, v11.8H, v8.8H // ........................*........................ + // mls v10.8H, v14.8H, v7.H[0] // ..........................*...................... + // mls v29.8H, v0.8H, v7.H[0] // ...........................*..................... + // sub v0.8H, v5.8H, v11.8H // ............................*.................... + // sqrdmulh v23.8H, v0.8H, v31.8H // ..............................*.................. 
+ // sub v16.8H, v10.8H, v29.8H // ...............................*................. + // mul v25.8H, v0.8H, v9.8H // ................................*................ + // sqrdmulh v28.8H, v16.8H, v31.8H // ..................................*.............. + // mul v20.8H, v16.8H, v9.8H // .................................*............... + // add v5.8H, v5.8H, v11.8H // ....................................*............ + // add v14.8H, v10.8H, v29.8H // ...................................*............. + // mls v20.8H, v28.8H, v7.H[0] // ......................................*.......... + // mls v25.8H, v23.8H, v7.H[0] // .....................................*........... + // trn1 v29.4S, v5.4S, v14.4S // .......................................*......... + // ldr q6, [x3], #16 // ....*............................................ + // trn2 v26.4S, v5.4S, v14.4S // ........................................*........ + // trn2 v19.4S, v25.4S, v20.4S // ..........................................*...... + // trn1 v20.4S, v25.4S, v20.4S // .........................................*....... + // trn2 v27.2D, v29.2D, v20.2D // ............................................*.... + // trn2 v0.2D, v26.2D, v19.2D // ...........................................*..... + // trn1 v22.2D, v26.2D, v19.2D // .............................................*... + // trn1 v28.2D, v29.2D, v20.2D // ..............................................*.. + // add v3.8H, v27.8H, v0.8H // ...............................................*. + // sub v19.8H, v27.8H, v0.8H // ................................................* + + sub count, count, #1 +layer4567_start: + add v25.8H, v28.8H, v22.8H // ................................................*.................................. + ldr q30, [x1, #112] // ...e............................................................................... + sub v23.8H, v28.8H, v22.8H // ...............................................*................................... 
+ ldr q12, [x1, #96] // ..e................................................................................ + sqrdmulh v20.8H, v19.8H, v6.H[5] // .......................................................*........................... + ldr q26, [x1, #64] // e.................................................................................. + ldr q21, [x1, #80] // .e................................................................................. + mul v24.8H, v19.8H, v6.H[4] // ......................................................*............................ + ldr q29, [x4, #64] // ................e.................................................................. + sqdmulh v22.8H, v3.8H, v7.H[1] // ............................................................*...................... + mul v4.8H, v23.8H, v6.H[2] // .................................................*................................. + // gap // ................................................................................... + ldr q2, [x4, #32] // ..............e.................................................................... + sqdmulh v28.8H, v25.8H, v7.H[1] // .........................................................*......................... + // gap // ................................................................................... + sqrdmulh v23.8H, v23.8H, v6.H[3] // ..................................................*................................ + trn1 v13.4S, v12.4S, v30.4S // ......e............................................................................ + trn2 v19.4S, v12.4S, v30.4S // .......e........................................................................... + ldr q14, [x4, #48] // ...............e................................................................... + // gap // ................................................................................... 
+ trn2 v9.4S, v26.4S, v21.4S // .....e............................................................................. + ldr q31, [x4, #16] // .............e..................................................................... + // gap // ................................................................................... + srshr v22.8H, v22.8H, #11 // .............................................................*..................... + trn1 v11.4S, v26.4S, v21.4S // ....e.............................................................................. + // gap // ................................................................................... + srshr v28.8H, v28.8H, #11 // ..........................................................*........................ + // gap // ................................................................................... + trn1 v5.2D, v9.2D, v19.2D // ...........e....................................................................... + trn2 v8.2D, v9.2D, v19.2D // .........e......................................................................... + ldr q9, [x4], #(6*16) // ............e...................................................................... + // gap // ................................................................................... + trn1 v12.2D, v11.2D, v13.2D // ..........e........................................................................ + trn2 v11.2D, v11.2D, v13.2D // ........e.......................................................................... + // gap // ................................................................................... + ldr q15, [x4, #-16] // .................e................................................................. + mls v3.8H, v22.8H, v7.H[0] // ..............................................................*.................... + mls v25.8H, v28.8H, v7.H[0] // ...........................................................*....................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + sub v19.8H, v12.8H, v5.8H // ..................e................................................................ + sub v26.8H, v11.8H, v8.8H // .......................e........................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v5.8H, v12.8H, v5.8H // ...................e............................................................... + mls v4.8H, v23.8H, v7.H[0] // ...................................................*............................... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v0.8H, v26.8H, v15.8H // ..........................e........................................................ + mul v10.8H, v19.8H, v2.8H // ....................e.............................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mul v29.8H, v26.8H, v29.8H // .........................e......................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v24.8H, v20.8H, v7.H[0] // ........................................................*.......................... 
+ sqdmulh v23.8H, v4.8H, v7.H[1] // ...............................................................*................... + sub v22.8H, v25.8H, v3.8H // .....................................................................*............. + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v14.8H, v19.8H, v14.8H // .....................e............................................................. + add v27.8H, v25.8H, v3.8H // ......................................................................*............ + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v3.8H, v22.8H, v6.H[1] // ........................................................................*.......... + // gap // ................................................................................... + // gap // ................................................................................... + sqdmulh v19.8H, v24.8H, v7.H[1] // ..................................................................*................ + srshr v23.8H, v23.8H, #11 // ................................................................*.................. + str q27, [x1], #(64) // ...............................................................................*... + add v11.8H, v11.8H, v8.8H // ........................e.......................................................... + // gap // ................................................................................... + mls v10.8H, v14.8H, v7.H[0] // ......................e............................................................ + mls v29.8H, v0.8H, v7.H[0] // ...........................e....................................................... 
+ // gap // ................................................................................... + // gap // ................................................................................... + sub v0.8H, v5.8H, v11.8H // ............................e...................................................... + // gap // ................................................................................... + // gap // ................................................................................... + srshr v19.8H, v19.8H, #11 // ...................................................................*............... + mls v4.8H, v23.8H, v7.H[0] // .................................................................*................. + mul v27.8H, v22.8H, v6.H[0] // .......................................................................*........... + // gap // ................................................................................... + // gap // ................................................................................... + sqrdmulh v23.8H, v0.8H, v31.8H // ...............................e................................................... + sub v16.8H, v10.8H, v29.8H // .................................e................................................. + // gap // ................................................................................... + // gap // ................................................................................... + mul v25.8H, v0.8H, v9.8H // ..............................e.................................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v24.8H, v19.8H, v7.H[0] // ....................................................................*.............. 
+ sqrdmulh v28.8H, v16.8H, v31.8H // ....................................e.............................................. + mul v20.8H, v16.8H, v9.8H // ...................................e............................................... + // gap // ................................................................................... + // gap // ................................................................................... + mls v27.8H, v3.8H, v7.H[0] // .........................................................................*......... + add v5.8H, v5.8H, v11.8H // .............................e..................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v14.8H, v10.8H, v29.8H // ..................................e................................................ + // gap // ................................................................................... + // gap // ................................................................................... + sub v19.8H, v4.8H, v24.8H // ..........................................................................*........ + mls v20.8H, v28.8H, v7.H[0] // .....................................e............................................. + mls v25.8H, v23.8H, v7.H[0] // ................................e.................................................. + // gap // ................................................................................... + // gap // ................................................................................... + str q27, [x1, #-32] // .................................................................................*. + // gap // ................................................................................... 
+ mul v23.8H, v19.8H, v6.H[0] // ............................................................................*...... + sqrdmulh v22.8H, v19.8H, v6.H[1] // .............................................................................*..... + trn1 v29.4S, v5.4S, v14.4S // ......................................e............................................ + // gap // ................................................................................... + ldr q6, [x3], #16 // ..............................................e.................................... + trn2 v26.4S, v5.4S, v14.4S // .......................................e........................................... + trn2 v19.4S, v25.4S, v20.4S // .........................................e......................................... + trn1 v20.4S, v25.4S, v20.4S // ........................................e.......................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + // gap // ................................................................................... + add v11.8H, v4.8H, v24.8H // ...........................................................................*....... + mls v23.8H, v22.8H, v7.H[0] // ..............................................................................*.... + trn2 v27.2D, v29.2D, v20.2D // ..........................................e........................................ + trn2 v0.2D, v26.2D, v19.2D // ...........................................e....................................... + // gap // ................................................................................... + // gap // ................................................................................... 
+ trn1 v22.2D, v26.2D, v19.2D // .............................................e..................................... + trn1 v28.2D, v29.2D, v20.2D // ............................................e...................................... + // gap // ................................................................................... + str q11, [x1, #-48] // ................................................................................*.. + // gap // ................................................................................... + str q23, [x1, #-16] // ..................................................................................* + add v3.8H, v27.8H, v0.8H // .....................................................e............................. + sub v19.8H, v27.8H, v0.8H // ....................................................e.............................. + + // original source code + // ldr q8, [x1, #(16*0)] // ....e.............................................................................|....e........................................................................... + // ldr q9, [x1, #(16*1)] // .....e............................................................................|.....e.......................................................................... + // ldr q10, [x1, #(16*2)] // ..e...............................................................................|..e............................................................................. + // ldr q11, [x1, #(16*3)] // e.................................................................................|e............................................................................... + // trn1 v25.4s, v8.4s, v9.4s // ...................e..............................................................|...................e............................................................ 
+ // trn2 v26.4s, v8.4s, v9.4s // ................e.................................................................|................e............................................................... + // trn1 v27.4s, v10.4s, v11.4s // .............e....................................................................|.............e.................................................................. + // trn2 v28.4s, v10.4s, v11.4s // ..............e...................................................................|..............e................................................................. + // trn2 v10.2d, v25.2d, v27.2d // .........................e........................................................|.........................e...................................................... + // trn2 v11.2d, v26.2d, v28.2d // ......................e...........................................................|......................e......................................................... + // trn1 v8.2d, v25.2d, v27.2d // ........................e.........................................................|........................e....................................................... + // trn1 v9.2d, v26.2d, v28.2d // .....................e............................................................|.....................e.......................................................... + // ldr q0, [x4], #(6*16) // .......................e..........................................................|.......................e........................................................ + // ldr q4, [x4, #(-6*16 + 1*16)] // .................e................................................................|.................e.............................................................. + // ldr q1, [x4, #(-6*16 + 2*16)] // ..........e.......................................................................|..........e..................................................................... 
+ // ldr q5, [x4, #(-6*16 + 3*16)] // ...............e..................................................................|...............e................................................................ + // ldr q2, [x4, #(-6*16 + 4*16)] // .......e..........................................................................|.......e........................................................................ + // ldr q6, [x4, #(-6*16 + 5*16)] // ..........................e.......................................................|..........................e..................................................... + // sub v24.8h, v8.8h, v9.8h // .............................e....................................................|.............................e.................................................. + // add v8.8h, v8.8h, v9.8h // ...............................e..................................................|...............................e................................................ + // mul v9.8h, v24.8h, v1.8h // ..................................e...............................................|..................................e............................................. + // sqrdmulh v24.8h, v24.8h, v5.8h // .......................................e..........................................|.......................................e........................................ + // mls v9.8h, v24.8h, v7.h[0] // ..............................................e...................................|..............................................e................................. + // sub v24.8h, v10.8h, v11.8h // ..............................e...................................................|..............................e................................................. + // add v10.8h, v10.8h, v11.8h // .............................................e....................................|.............................................e.................................. 
+ // mul v11.8h, v24.8h, v2.8h // ...................................e..............................................|...................................e............................................ + // sqrdmulh v24.8h, v24.8h, v6.8h // .................................e................................................|.................................e.............................................. + // mls v11.8h, v24.8h, v7.h[0] // ...............................................e..................................|...............................................e................................ + // sub v24.8h, v8.8h, v10.8h // ................................................e.................................|................................................e............................... + // add v8.8h, v8.8h, v10.8h // ...........................................................e......................|...........................................................e.................... + // mul v10.8h, v24.8h, v0.8h // ......................................................e...........................|......................................................e......................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ....................................................e.............................|....................................................e........................... + // mls v10.8h, v24.8h, v7.h[0] // ...............................................................e..................|...............................................................e................ + // sub v24.8h, v9.8h, v11.8h // .....................................................e............................|.....................................................e.......................... + // add v9.8h, v9.8h, v11.8h // ............................................................e.....................|............................................................e................... 
+ // mul v11.8h, v24.8h, v0.8h // .........................................................e........................|.........................................................e...................... + // sqrdmulh v24.8h, v24.8h, v4.8h // ........................................................e.........................|........................................................e....................... + // mls v11.8h, v24.8h, v7.h[0] // ..............................................................e...................|..............................................................e................. + // trn1 v25.4s, v8.4s, v9.4s // ...................................................................e..............|...................................................................e............ + // trn2 v26.4s, v8.4s, v9.4s // .....................................................................e............|.....................................................................e.......... + // trn1 v27.4s, v10.4s, v11.4s // .......................................................................e..........|.......................................................................e........ + // trn2 v28.4s, v10.4s, v11.4s // ......................................................................e...........|......................................................................e......... + // trn2 v10.2d, v25.2d, v27.2d // ..........................................................................e.......|..........................................................................e..... + // trn2 v11.2d, v26.2d, v28.2d // ...........................................................................e......|...........................................................................e.... + // trn1 v8.2d, v25.2d, v27.2d // .............................................................................e....|.............................................................................e.. 
+ // trn1 v9.2d, v26.2d, v28.2d // ............................................................................e.....|............................................................................e... + // ldr q0, [x3], #16 // ....................................................................e.............|....................................................................e........... + // sub v24.8h, v8.8h, v9.8h // .*................................................................................|.*.............................................................................. + // add v8.8h, v8.8h, v9.8h // ..................................................................................*................................................................................ + // mul v9.8h, v24.8h, v0.h[2] // .........*........................................................................|.........*...................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............*.....................................................................|............*................................................................... + // mls v9.8h, v24.8h, v7.h[0] // ................................*.................................................|................................*............................................... + // sub v24.8h, v10.8h, v11.8h // .................................................................................e|................................................................................ + // add v10.8h, v10.8h, v11.8h // ................................................................................e.|................................................................................ + // mul v11.8h, v24.8h, v0.h[4] // ......*...........................................................................|......*......................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // ...*..............................................................................|...*............................................................................ + // mls v11.8h, v24.8h, v7.h[0] // ....................................*.............................................|....................................*........................................... + // sqdmulh v25.8h, v8.8h, v7.h[1] // ...........*......................................................................|...........*.................................................................... + // srshr v25.8h, v25.8h, #11 // ....................*.............................................................|....................*........................................................... + // mls v8.8h, v25.8h, v7.h[0] // ............................*.....................................................|............................*................................................... + // sqdmulh v25.8h, v10.8h, v7.h[1] // ........*.........................................................................|........*....................................................................... + // srshr v25.8h, v25.8h, #11 // ..................*...............................................................|..................*............................................................. + // mls v10.8h, v25.8h, v7.h[0] // ...........................*......................................................|...........................*.................................................... + // sqdmulh v25.8h, v9.8h, v7.h[1] // .....................................*............................................|.....................................*.......................................... 
+ // srshr v25.8h, v25.8h, #11 // ...........................................*......................................|...........................................*.................................... + // mls v9.8h, v25.8h, v7.h[0] // ..................................................*...............................|..................................................*............................. + // sqdmulh v25.8h, v11.8h, v7.h[1] // ..........................................*.......................................|..........................................*..................................... + // srshr v25.8h, v25.8h, #11 // .................................................*................................|.................................................*.............................. + // mls v11.8h, v25.8h, v7.h[0] // .......................................................*..........................|.......................................................*........................ + // sub v24.8h, v8.8h, v10.8h // ......................................*...........................................|......................................*......................................... + // add v8.8h, v8.8h, v10.8h // ........................................*.........................................|........................................*....................................... + // mul v10.8h, v24.8h, v0.h[0] // ...................................................*..............................|...................................................*............................ + // sqrdmulh v24.8h, v24.8h, v0.h[1] // .........................................*........................................|.........................................*...................................... + // mls v10.8h, v24.8h, v7.h[0] // ..........................................................*.......................|..........................................................*..................... 
+ // sub v24.8h, v9.8h, v11.8h // .............................................................*....................|.............................................................*..................
+ // add v9.8h, v9.8h, v11.8h // ........................................................................*.........|........................................................................*.......
+ // mul v11.8h, v24.8h, v0.h[0] // .................................................................*................|.................................................................*..............
+ // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..................................................................*...............|..................................................................*.............
+ // mls v11.8h, v24.8h, v7.h[0] // .........................................................................*........|.........................................................................*......
+ // str q8, [x1], #(64) // ............................................*.....................................|............................................*...................................
+ // str q9, [x1, #(-64 + 16*1)] // ..............................................................................*...|..............................................................................*.
+ // str q10, [x1, #(-64 + 16*2)] // ................................................................*.................|................................................................*...............
+ // str q11, [x1, #(-64 + 16*3)] // ...............................................................................*..|...............................................................................*
+
+ sub count, count, #1
+ cbnz count, layer4567_start
+ //
+ // Postamble of the software-pipelined loop `layer4567_start`.
+ //
+ // Falls through to here once `count` reaches zero. Its 34 instructions
+ // appear to correspond one-to-one to the '*'-marked entries of the loop
+ // listing above (same opcodes and lane indices, different registers),
+ // i.e. the part of the final iteration the loop body had not issued yet.
+ // NOTE(review): 'e' presumably marks instructions the scheduler hoisted
+ // into the preceding loop pass -- confirm against the SLOTHY documentation.
+ //
+ // Reading the annotations:
+ //   * The trailing `// ..*..` column on an instruction gives its index in
+ //     the "original source code" listing at the end of this section.
+ //   * In that listing the column instead gives the instruction's position
+ //     in the scheduled order (gap lines are not counted).
+ //   * `// gap` lines carry no instruction.
+ //     NOTE(review): presumably unused issue slots in the scheduler's model.
+ //   * Register names differ between the scheduled code and the listing.
+ //
+ // NOTE(review): the sqdmulh (v7.H[1]) / srshr #11 / mls (v7.H[0]) triples
+ // look like a Barrett-style reduction, and the mul / sqrdmulh / mls triples
+ // like a modular multiplication by a constant with the modulus in v7.H[0]
+ // -- confirm against the macros in the unscheduled source.
+ //
+ // This is generated output: do not reorder by hand, re-run the optimizer
+ // on the source loop instead.
+ //
+ sub v27.8H, v28.8H, v22.8H // .*................................
+ // gap // ..................................
+ mul v13.8H, v19.8H, v6.H[4] // ...*..............................
+ // gap // ..................................
+ sqrdmulh v19.8H, v19.8H, v6.H[5] // ..*...............................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ mul v1.8H, v27.8H, v6.H[2] // .....*............................
+ sqrdmulh v5.8H, v27.8H, v6.H[3] // .......*..........................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ add v28.8H, v28.8H, v22.8H // *.................................
+ // gap // ..................................
+ // gap // ..................................
+ mls v13.8H, v19.8H, v7.H[0] // .............*....................
+ // gap // ..................................
+ sqdmulh v26.8H, v3.8H, v7.H[1] // ....*.............................
+ mls v1.8H, v5.8H, v7.H[0] // ............*.....................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ sqdmulh v22.8H, v28.8H, v7.H[1] // ......*...........................
+ // gap // ..................................
+ // gap // ..................................
+ sqdmulh v31.8H, v13.8H, v7.H[1] // ..................*...............
+ // gap // ..................................
+ srshr v30.8H, v26.8H, #11 // ........*.........................
+ // gap // ..................................
+ sqdmulh v19.8H, v1.8H, v7.H[1] // ..............*...................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ srshr v0.8H, v22.8H, #11 // .........*........................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ mls v3.8H, v30.8H, v7.H[0] // ..........*.......................
+ srshr v30.8H, v19.8H, #11 // ...................*..............
+ srshr v19.8H, v31.8H, #11 // .....................*............
+ // gap // ..................................
+ // gap // ..................................
+ mls v28.8H, v0.8H, v7.H[0] // ...........*......................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ mls v1.8H, v30.8H, v7.H[0] // ......................*...........
+ // gap // ..................................
+ mls v13.8H, v19.8H, v7.H[0] // ........................*.........
+ // gap // ..................................
+ sub v19.8H, v28.8H, v3.8H // ...............*..................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ sub v23.8H, v1.8H, v13.8H // ..........................*.......
+ sqrdmulh v30.8H, v19.8H, v6.H[1] // .................*................
+ // gap // ..................................
+ // gap // ..................................
+ mul v4.8H, v19.8H, v6.H[0] // .......................*..........
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ mul v27.8H, v23.8H, v6.H[0] // ............................*.....
+ sqrdmulh v21.8H, v23.8H, v6.H[1] // .............................*....
+ // gap // ..................................
+ // gap // ..................................
+ add v3.8H, v28.8H, v3.8H // ................*.................
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ mls v4.8H, v30.8H, v7.H[0] // .........................*........
+ add v20.8H, v1.8H, v13.8H // ..............................*...
+ // gap // ..................................
+ // gap // ..................................
+ mls v27.8H, v21.8H, v7.H[0] // ...............................*..
+ // Store the four result vectors. The first store post-increments x1 by 64,
+ // so the other three address the same 64-byte block with offsets -48, -32
+ // and -16 from the updated pointer.
+ str q3, [x1], #(64) // ....................*.............
+ // gap // ..................................
+ // gap // ..................................
+ str q20, [x1, #-48] // ................................*.
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ str q4, [x1, #-32] // ...........................*......
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+ str q27, [x1, #-16] // .................................*
+ // gap // ..................................
+ // gap // ..................................
+ // gap // ..................................
+
+ // original source code
+ // add v25.8H, v28.8H, v22.8H // .....*............................
+ // sub v23.8H, v28.8H, v22.8H // *.................................
+ // sqrdmulh v20.8H, v19.8H, v6.H[5] // ..*...............................
+ // mul v24.8H, v19.8H, v6.H[4] // .*................................
+ // sqdmulh v22.8H, v3.8H, v7.H[1] // .......*..........................
+ // mul v4.8H, v23.8H, v6.H[2] // ...*..............................
+ // sqdmulh v28.8H, v25.8H, v7.H[1] // .........*........................
+ // sqrdmulh v23.8H, v23.8H, v6.H[3] // ....*.............................
+ // srshr v22.8H, v22.8H, #11 // ...........*......................
+ // srshr v28.8H, v28.8H, #11 // .............*....................
+ // mls v3.8H, v22.8H, v7.H[0] // ..............*...................
+ // mls v25.8H, v28.8H, v7.H[0] // .................*................
+ // mls v4.8H, v23.8H, v7.H[0] // ........*.........................
+ // mls v24.8H, v20.8H, v7.H[0] // ......*...........................
+ // sqdmulh v23.8H, v4.8H, v7.H[1] // ............*.....................
+ // sub v22.8H, v25.8H, v3.8H // ....................*.............
+ // add v27.8H, v25.8H, v3.8H // ..........................*.......
+ // sqrdmulh v3.8H, v22.8H, v6.H[1] // ......................*...........
+ // sqdmulh v19.8H, v24.8H, v7.H[1] // ..........*.......................
+ // srshr v23.8H, v23.8H, #11 // ...............*..................
+ // str q27, [x1], #(64) // ..............................*...
+ // srshr v19.8H, v19.8H, #11 // ................*.................
+ // mls v4.8H, v23.8H, v7.H[0] // ..................*...............
+ // mul v27.8H, v22.8H, v6.H[0] // .......................*..........
+ // mls v24.8H, v19.8H, v7.H[0] // ...................*..............
+ // mls v27.8H, v3.8H, v7.H[0] // ...........................*......
+ // sub v19.8H, v4.8H, v24.8H // .....................*............
+ // str q27, [x1, #-32] // ................................*.
+ // mul v23.8H, v19.8H, v6.H[0] // ........................*.........
+ // sqrdmulh v22.8H, v19.8H, v6.H[1] // .........................*........
+ // add v11.8H, v4.8H, v24.8H // ............................*.....
+ // mls v23.8H, v22.8H, v7.H[0] // .............................*....
+ // str q11, [x1, #-48] // ...............................*..
+ // str q23, [x1, #-16] // .................................*
+
+
+ // ---------------------------------------------------------------------
+ // Set-up for the second loop, `layer123_start`.
+ //
+ // `ninv` / `ninv_tw` alias v29 / v30. The scheduled code below uses the
+ // physical register names (see the final mul / sqrdmulh pair of the
+ // preamble, which read v29 and v30).
+
+ ninv .req v29
+ ninv_tw .req v30
+
+ // ld1r loads one 16-bit element and replicates it to all eight lanes.
+ // NOTE(review): ASM_LOAD is a macro defined outside this excerpt;
+ // presumably it places the address of the named symbol in xtmp.
+ ASM_LOAD(xtmp, ninv_addr)
+ ld1r {ninv.8h}, [xtmp]
+ ASM_LOAD(xtmp, ninv_tw_addr)
+ ld1r {ninv_tw.8h}, [xtmp]
+
+ // Loop counter for `layer123_start`.
+ // NOTE(review): load_roots_123 is defined outside this excerpt; presumably
+ // it fills the twiddle registers (v0 / v1 are read lane-wise below) via
+ // r_ptr0 -- confirm.
+ mov count, #4
+ ASM_LOAD(r_ptr0, roots_l012)
+ load_roots_123
+
+ // Align the following code to a 2^2 = 4-byte boundary.
+ .p2align 2
+
+ // Preamble of the software-pipelined loop `layer123_start` (37
+ // instructions, listed in source order under "original source code").
+ // It loads eight vectors spaced 64 bytes apart from x0 and starts the
+ // arithmetic on them; the loop body then opens with '*'-marked
+ // instructions that consume values produced here (e.g. v22 and v16).
+ // Annotation format: the trailing `// ..*..` column is the instruction's
+ // index in the listing below; `// gap` lines carry no instruction.
+ // This is generated output: do not reorder by hand.
+ ldr q10, [x0, #256] // *....................................
+ ldr q23, [x0, #320] // .....*...............................
+ // gap // .....................................
+ // gap // .....................................
+ ldr q21, [x0, #384] // ......*..............................
+ ldr q3, [x0, #448] // ..*..................................
+ // gap // .....................................
+ // gap // .....................................
+ ldr q6, [x0, #0] // ....*................................
+ ldr q9, [x0, #64] // .*...................................
+ // gap // .....................................
+ // gap // .....................................
+ ldr q15, [x0, #192] // ...*.................................
+ ldr q25, [x0, #128] // .......*.............................
+ // gap // .....................................
+ // gap // .....................................
+ sub v17.8H, v10.8H, v23.8H // .........*...........................
+ // gap // .....................................
+ // gap // .....................................
+ // gap // .....................................
+ sub v22.8H, v21.8H, v3.8H // ............*........................
+ add v19.8H, v21.8H, v3.8H // ..........................*..........
+ // gap // .....................................
+ // gap // .....................................
+ sqrdmulh v27.8H, v17.8H, v1.H[3] // .............*.......................
+ sub v13.8H, v6.8H, v9.8H // ........*............................
+ // gap // .....................................
+ // gap // .....................................
+ sub v2.8H, v25.8H, v15.8H // ..............*......................
+ mul v24.8H, v22.8H, v1.H[4] // ................*....................
+ // gap // .....................................
+ // gap // .....................................
+ // gap // .....................................
+ // gap // .....................................
+ sqrdmulh v31.8H, v22.8H, v1.H[5] // ...............*.....................
+ mul v22.8H, v13.8H, v0.H[6] // ...........*.........................
+ mul v28.8H, v17.8H, v1.H[2] // .................*...................
+ sqrdmulh v3.8H, v2.8H, v1.H[1] // ...................*.................
+ // gap // .....................................
+ // gap // .....................................
+ mul v16.8H, v2.8H, v1.H[0] // ......................*..............
+ sqrdmulh v13.8H, v13.8H, v0.H[7] // ..........*..........................
+ // gap // .....................................
+ // gap // .....................................
+ // gap // .....................................
+ mls v24.8H, v31.8H, v7.H[0] // ....................*................
+ // gap // .....................................
+ // gap // .....................................
+ // gap // .....................................
+ mls v28.8H, v27.8H, v7.H[0] // .....................*...............
+ // gap // .....................................
+ // gap // .....................................
+ mls v22.8H, v13.8H, v7.H[0] // ..................*..................
+ mls v16.8H, v3.8H, v7.H[0] // ........................*............
+ // gap // .....................................
+ // gap // .....................................
+ add v23.8H, v10.8H, v23.8H // .............................*.......
+ // gap // .....................................
+ // gap // .....................................
+ // gap // .....................................
+ add v5.8H, v28.8H, v24.8H // .......................*.............
+ sub v12.8H, v28.8H, v24.8H // .........................*...........
+ // gap // .....................................
+ // gap // .....................................
+ add v31.8H, v22.8H, v16.8H // ...........................*.........
+ sub v20.8H, v23.8H, v19.8H // ................................*....
+ // gap // .....................................
+ // gap // .....................................
+ add v13.8H, v23.8H, v19.8H // ..................................*..
+ // gap // .....................................
+ // gap // .....................................
+ // gap // .....................................
+ add v10.8H, v31.8H, v5.8H // ............................*........
+ sub v27.8H, v31.8H, v5.8H // .................................*...
+ // gap // .....................................
+ // gap // .....................................
+ mul v14.8H, v12.8H, v0.H[4] // ....................................*
+ // gap // .....................................
+ // gap // .....................................
+ sqrdmulh v11.8H, v20.8H, v0.H[5] // ...................................*.
+ mul v4.8H, v10.8H, v29.8H // ..............................*......
+ sqrdmulh v19.8H, v10.8H, v30.8H // ...............................*.....
+ // gap // .....................................
+ // gap // .....................................
+
+ // original source code
+ // ldr q10, [x0, #256] // *....................................
+ // ldr q9, [x0, #64] // .....*...............................
+ // ldr q17, [x0, #448] // ...*.................................
+ // ldr q15, [x0, #192] // ......*..............................
+ // ldr q6, [x0, #0] // ....*................................
+ // ldr q28, [x0, #320] // .*...................................
+ // ldr q2, [x0, #384] // ..*..................................
+ // ldr q25, [x0, #128] // .......*.............................
+ // sub v22.8H, v6.8H, v9.8H // ............*........................
+ // sub v4.8H, v10.8H, v28.8H // ........*............................
+ // sqrdmulh v8.8H, v22.8H, v0.H[7] // ....................*................
+ // mul v22.8H, v22.8H, v0.H[6] // ................*....................
+ // sub v23.8H, v2.8H, v17.8H // .........*...........................
+ // sqrdmulh v12.8H, v4.8H, v1.H[3] // ...........*.........................
+ // sub v26.8H, v25.8H, v15.8H // .............*.......................
+ // sqrdmulh v24.8H, v23.8H, v1.H[5] // ...............*.....................
+ // mul v31.8H, v23.8H, v1.H[4] // ..............*......................
+ // mul v21.8H, v4.8H, v1.H[2] // .................*...................
+ // mls v22.8H, v8.8H, v7.H[0] // .......................*.............
+ // sqrdmulh v8.8H, v26.8H, v1.H[1] // ..................*..................
+ // mls v31.8H, v24.8H, v7.H[0] // .....................*...............
+ // mls v21.8H, v12.8H, v7.H[0] // ......................*..............
+ // mul v16.8H, v26.8H, v1.H[0] // ...................*.................
+ // add v27.8H, v21.8H, v31.8H // ..........................*..........
+ // mls v16.8H, v8.8H, v7.H[0] // ........................*............
+ // sub v12.8H, v21.8H, v31.8H // ...........................*.........
+ // add v31.8H, v2.8H, v17.8H // ..........*..........................
+ // add v21.8H, v22.8H, v16.8H // ............................*........
+ // add v23.8H, v21.8H, v27.8H // ...............................*.....
+ // add v26.8H, v10.8H, v28.8H // .........................*...........
+ // mul v4.8H, v23.8H, v29.8H // ...................................*.
+ // sqrdmulh v19.8H, v23.8H, v30.8H // ....................................*
+ // sub v20.8H, v26.8H, v31.8H // .............................*.......
+ // sub v27.8H, v21.8H, v27.8H // ................................*....
+ // add v13.8H, v26.8H, v31.8H // ..............................*......
+ // sqrdmulh v11.8H, v20.8H, v0.H[5] // ..................................*..
+ // mul v14.8H, v12.8H, v0.H[4] // .................................*...
+
+ // Decrement once before entering the loop.
+ // NOTE(review): presumably because one iteration is split between the
+ // preamble above and a postamble after the loop; the loop's own
+ // decrement/branch lies outside this excerpt -- confirm.
+ sub count, count, #1
+layer123_start:
+ ldr q10, [x0, #272] // ....e...................................................................................
+ sub v3.8H, v22.8H, v16.8H // .................................*......................................................
+ add v22.8H, v6.8H, v9.8H // .........*..............................................................................
+ ldr q9, [x0, #80] // .e......................................................................................
+ ldr q17, [x0, #464] // .......e................................................................................
+ add v18.8H, v25.8H, v15.8H // ..............*.........................................................................
+ ldr q15, [x0, #208] // ...e....................................................................................
+ sqrdmulh v5.8H, v12.8H, v0.H[5] // ..............................................*.........................................
+ ldr q6, [x0, #16] // e.......................................................................................
+ mls v4.8H, v19.8H, v7.H[0] // .............................................................................*..........
+ sqrdmulh v19.8H, v27.8H, v0.H[1] // ........................................................*...............................
+ ldr q28, [x0, #336] // .....e..................................................................................
+ add v16.8H, v22.8H, v18.8H // .............................*..........................................................
+ mul v23.8H, v27.8H, v0.H[0] // .......................................................*................................
+ // gap // ........................................................................................
+ // gap // ........................................................................................ + mls v14.8H, v5.8H, v7.H[0] // ...............................................*........................................ + sub v27.8H, v22.8H, v18.8H // ............................*........................................................... + // gap // ........................................................................................ + ldr q2, [x0, #400] // ......e................................................................................. + mul v5.8H, v20.8H, v0.H[4] // ........................................*............................................... + ldr q25, [x0, #144] // ..e..................................................................................... + mul v18.8H, v3.8H, v0.H[2] // ...................................*.................................................... + str q4, [x0, #64] // .....................................................................................*.. + sqrdmulh v3.8H, v3.8H, v0.H[3] // ....................................*................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + sub v22.8H, v6.8H, v9.8H // ........e............................................................................... + sub v4.8H, v10.8H, v28.8H // ..................e..................................................................... + mls v23.8H, v19.8H, v7.H[0] // .........................................................*.............................. + // gap // ........................................................................................ + // gap // ........................................................................................ 
+ mls v5.8H, v11.8H, v7.H[0] // ..........................................*............................................. + sqrdmulh v8.8H, v22.8H, v0.H[7] // ...........e............................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v18.8H, v3.8H, v7.H[0] // .....................................*.................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v20.8H, v27.8H, v0.H[3] // ...............................*........................................................ + str q23, [x0, #320] // .....................................................................*.................. + sub v19.8H, v16.8H, v13.8H // ................................................*....................................... + mul v22.8H, v22.8H, v0.H[6] // ..........e............................................................................. + // gap // ........................................................................................ + sub v23.8H, v2.8H, v17.8H // .......................e................................................................ + // gap // ........................................................................................ + mul v3.8H, v27.8H, v0.H[2] // ..............................*......................................................... + // gap // ........................................................................................ + sqrdmulh v12.8H, v4.8H, v1.H[3] // .....................e.................................................................. 
+ // gap // ........................................................................................ + // gap // ........................................................................................ + sub v26.8H, v25.8H, v15.8H // .............e.......................................................................... + sqrdmulh v24.8H, v23.8H, v1.H[5] // ..........................e............................................................. + mul v31.8H, v23.8H, v1.H[4] // .........................e.............................................................. + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v3.8H, v20.8H, v7.H[0] // ................................*....................................................... + mul v21.8H, v4.8H, v1.H[2] // ....................e................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + add v11.8H, v16.8H, v13.8H // .................................................*...................................... + mls v22.8H, v8.8H, v7.H[0] // ............e........................................................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v8.8H, v26.8H, v1.H[1] // ................e....................................................................... + // gap // ........................................................................................ 
+ mls v31.8H, v24.8H, v7.H[0] // ...........................e............................................................ + mls v21.8H, v12.8H, v7.H[0] // ......................e................................................................. + mul v16.8H, v26.8H, v1.H[0] // ...............e........................................................................ + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v23.8H, v19.8H, v0.H[1] // ...................................................*.................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v4.8H, v19.8H, v0.H[0] // ..................................................*..................................... + // gap // ........................................................................................ + sub v19.8H, v3.8H, v5.8H // ..........................................................*............................. + add v3.8H, v3.8H, v5.8H // ...........................................................*............................ + // gap // ........................................................................................ + add v27.8H, v21.8H, v31.8H // ............................................e........................................... + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v16.8H, v8.8H, v7.H[0] // .................e...................................................................... 
+ sub v24.8H, v18.8H, v14.8H // ...............................................................*........................ + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v8.8H, v19.8H, v0.H[0] // ............................................................*........................... + add v5.8H, v18.8H, v14.8H // ................................................................*....................... + // gap // ........................................................................................ + sub v12.8H, v21.8H, v31.8H // ...........................................e............................................ + // gap // ........................................................................................ + add v31.8H, v2.8H, v17.8H // ........................e............................................................... + // gap // ........................................................................................ + mls v4.8H, v23.8H, v7.H[0] // ....................................................*................................... + // gap // ........................................................................................ + sqrdmulh v17.8H, v5.8H, v30.8H // ..................................................................................*..... + // gap // ........................................................................................ + // gap // ........................................................................................ + mul v2.8H, v5.8H, v29.8H // .................................................................................*...... + add v21.8H, v22.8H, v16.8H // ..................................e..................................................... + // gap // ........................................................................................ 
+ // gap // ........................................................................................ + mul v14.8H, v3.8H, v29.8H // ..............................................................................*......... + sqrdmulh v5.8H, v19.8H, v0.H[1] // .............................................................*.......................... + str q4, [x0, #256] // ....................................................................*................... + sqrdmulh v20.8H, v24.8H, v0.H[1] // ..................................................................*..................... + // gap // ........................................................................................ + mul v13.8H, v24.8H, v0.H[0] // .................................................................*...................... + mul v24.8H, v11.8H, v29.8H // ........................................................................*............... + // gap // ........................................................................................ + // gap // ........................................................................................ + sqrdmulh v19.8H, v11.8H, v30.8H // .........................................................................*.............. + // gap // ........................................................................................ + // gap // ........................................................................................ + mls v2.8H, v17.8H, v7.H[0] // ...................................................................................*.... + // gap // ........................................................................................ + sqrdmulh v3.8H, v3.8H, v30.8H // ...............................................................................*........ + mls v8.8H, v5.8H, v7.H[0] // ..............................................................*......................... 
+ // gap // ........................................................................................ + add v23.8H, v21.8H, v27.8H // ......................................................e................................. + // gap // ........................................................................................ + mls v13.8H, v20.8H, v7.H[0] // ...................................................................*.................... + // gap // ........................................................................................ + mls v24.8H, v19.8H, v7.H[0] // ..........................................................................*............. + str q2, [x0, #192] // .......................................................................................* + add v26.8H, v10.8H, v28.8H // ...................e.................................................................... + // gap // ........................................................................................ + str q8, [x0, #384] // ......................................................................*................. + mul v4.8H, v23.8H, v29.8H // ...........................................................................e............ + mls v14.8H, v3.8H, v7.H[0] // ................................................................................*....... + // gap // ........................................................................................ + sqrdmulh v19.8H, v23.8H, v30.8H // ............................................................................e........... + sub v20.8H, v26.8H, v31.8H // ......................................e................................................. + // gap // ........................................................................................ + str q13, [x0, #448] // .......................................................................*................ 
+ sub v27.8H, v21.8H, v27.8H // .....................................................e.................................. + add v13.8H, v26.8H, v31.8H // .......................................e................................................ + str q24, [x0], #(16) // ....................................................................................*... + // gap // ........................................................................................ + sqrdmulh v11.8H, v20.8H, v0.H[5] // .........................................e.............................................. + str q14, [x0, #112] // ......................................................................................*. + mul v14.8H, v12.8H, v0.H[4] // .............................................e.......................................... + // gap // ........................................................................................ + + // original source code + // ldr q8, [x0, #0] // ........e...............................................................................|.......e.............................................................................. + // ldr q9, [x0, #(1*(512/8))] // ...e....................................................................................|..e................................................................................... + // ldr q10, [x0, #(2*(512/8))] // ..................e.....................................................................|.................e.................................................................... + // ldr q11, [x0, #(3*(512/8))] // ......e.................................................................................|.....e................................................................................ + // ldr q12, [x0, #(4*(512/8))] // e.......................................................................................e...................................................................................... 
+ // ldr q13, [x0, #(5*(512/8))] // ...........e............................................................................|..........e........................................................................... + // ldr q14, [x0, #(6*(512/8))] // ................e.......................................................................|...............e...................................................................... + // ldr q15, [x0, #(7*(512/8))] // ....e...................................................................................|...e.................................................................................. + // sub v24.8h, v8.8h, v9.8h // ......................e.................................................................|.....................e................................................................ + // add v8.8h, v8.8h, v9.8h // ..*.....................................................................................|.*.................................................................................... + // mul v9.8h, v24.8h, v0.h[6] // ...............................e........................................................|..............................e....................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[7] // ..........................e.............................................................|.........................e............................................................ + // mls v9.8h, v24.8h, v7.h[0] // .........................................e..............................................|........................................e............................................. + // sub v24.8h, v10.8h, v11.8h // ...................................e....................................................|..................................e................................................... 
+ // add v10.8h, v10.8h, v11.8h // .....*..................................................................................|....*................................................................................. + // mul v11.8h, v24.8h, v1.h[0] // .............................................e..........................................|............................................e......................................... + // sqrdmulh v24.8h, v24.8h, v1.h[1] // ..........................................e.............................................|.........................................e............................................ + // mls v11.8h, v24.8h, v7.h[0] // ...................................................e....................................|..................................................e................................... + // sub v24.8h, v12.8h, v13.8h // .......................e................................................................|......................e............................................................... + // add v12.8h, v12.8h, v13.8h // ...........................................................................e............|..........................................................................e........... + // mul v13.8h, v24.8h, v1.h[2] // .......................................e................................................|......................................e............................................... + // sqrdmulh v24.8h, v24.8h, v1.h[3] // ..................................e.....................................................|.................................e.................................................... + // mls v13.8h, v24.8h, v7.h[0] // ............................................e...........................................|...........................................e.......................................... 
+ // sub v24.8h, v14.8h, v15.8h // ................................e.......................................................|...............................e...................................................... + // add v14.8h, v14.8h, v15.8h // ........................................................e...............................|.......................................................e.............................. + // mul v15.8h, v24.8h, v1.h[4] // .....................................e..................................................|....................................e................................................. + // sqrdmulh v24.8h, v24.8h, v1.h[5] // ....................................e...................................................|...................................e.................................................. + // mls v15.8h, v24.8h, v7.h[0] // ...........................................e............................................|..........................................e........................................... + // sub v24.8h, v8.8h, v10.8h // ...............*........................................................................|..............*....................................................................... + // add v8.8h, v8.8h, v10.8h // ............*...........................................................................|...........*.......................................................................... + // mul v10.8h, v24.8h, v0.h[2] // .................................*......................................................|................................*..................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // ............................*...........................................................|...........................*.......................................................... 
+ // mls v10.8h, v24.8h, v7.h[0] // ......................................*.................................................|.....................................*................................................ + // sub v24.8h, v9.8h, v11.8h // .*......................................................................................|*..................................................................................... + // add v9.8h, v9.8h, v11.8h // ............................................................e...........................|...........................................................e.......................... + // mul v11.8h, v24.8h, v0.h[2] // ...................*....................................................................|..................*................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[3] // .....................*..................................................................|....................*................................................................. + // mls v11.8h, v24.8h, v7.h[0] // ...........................*............................................................|..........................*........................................................... + // sub v24.8h, v12.8h, v14.8h // ................................................................................e.......|...............................................................................e...... + // add v12.8h, v12.8h, v14.8h // ...................................................................................e....|..................................................................................e... + // mul v14.8h, v24.8h, v0.h[4] // .................*......................................................................|................*..................................................................... 
+ // sqrdmulh v24.8h, v24.8h, v0.h[5] // .....................................................................................e..|....................................................................................e. + // mls v14.8h, v24.8h, v7.h[0] // .........................*..............................................................|........................*............................................................. + // sub v24.8h, v13.8h, v15.8h // .......................................................e................................|......................................................e............................... + // add v13.8h, v13.8h, v15.8h // ..................................................e.....................................|.................................................e.................................... + // mul v15.8h, v24.8h, v0.h[4] // .......................................................................................e|...................................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[5] // .......*................................................................................|......*............................................................................... + // mls v15.8h, v24.8h, v7.h[0] // ..............*.........................................................................|.............*........................................................................ + // sub v24.8h, v8.8h, v12.8h // ..............................*.........................................................|.............................*........................................................ + // add v8.8h, v8.8h, v12.8h // ........................................*...............................................|.......................................*.............................................. 
+ // mul v12.8h, v24.8h, v0.h[0] // ...............................................*........................................|..............................................*....................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................*.........................................|.............................................*........................................ + // mls v12.8h, v24.8h, v7.h[0] // .........................................................*..............................|........................................................*............................. + // sub v24.8h, v9.8h, v13.8h // ..................................................................................e.....|.................................................................................e.... + // add v9.8h, v9.8h, v13.8h // .......................................................................e................|......................................................................e............... + // mul v13.8h, v24.8h, v0.h[0] // .............*..........................................................................|............*......................................................................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..........*.............................................................................|.........*............................................................................ + // mls v13.8h, v24.8h, v7.h[0] // ........................*...............................................................|.......................*.............................................................. + // sub v24.8h, v10.8h, v14.8h // ................................................*.......................................|...............................................*...................................... 
+ // add v10.8h, v10.8h, v14.8h // .................................................*......................................|................................................*..................................... + // mul v14.8h, v24.8h, v0.h[0] // .....................................................*..................................|....................................................*................................. + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ..............................................................*.........................|.............................................................*........................ + // mls v14.8h, v24.8h, v7.h[0] // ......................................................................*.................|.....................................................................*................ + // sub v24.8h, v11.8h, v15.8h // ....................................................*...................................|...................................................*.................................. + // add v11.8h, v11.8h, v15.8h // ......................................................*.................................|.....................................................*................................ + // mul v15.8h, v24.8h, v0.h[0] // .................................................................*......................|................................................................*..................... + // sqrdmulh v24.8h, v24.8h, v0.h[1] // ................................................................*.......................|...............................................................*...................... + // mls v15.8h, v24.8h, v7.h[0] // ........................................................................*...............|.......................................................................*.............. 
+ // str q12, [x0, #(4*(512/8))] // ...............................................................*........................|..............................................................*....................... + // str q13, [x0, #(5*(512/8))] // .............................*..........................................................|............................*......................................................... + // str q14, [x0, #(6*(512/8))] // ............................................................................*...........|...........................................................................*.......... + // str q15, [x0, #(7*(512/8))] // .................................................................................*......|................................................................................*..... + // mul v12.8h, v8.8h, v29.8h // ..................................................................*.....................|.................................................................*.................... + // sqrdmulh v8.8h, v8.8h, v30.8h // ...................................................................*....................|..................................................................*................... + // mls v12.8h, v8.8h, v7.h[0] // .........................................................................*..............|........................................................................*............. + // mul v13.8h, v9.8h, v29.8h // .............................................................................e..........|............................................................................e......... + // sqrdmulh v9.8h, v9.8h, v30.8h // ...............................................................................e........|..............................................................................e....... 
+ // mls v13.8h, v9.8h, v7.h[0] // .........*..............................................................................|........*............................................................................. + // mul v14.8h, v10.8h, v29.8h // .............................................................*..........................|............................................................*......................... + // sqrdmulh v10.8h, v10.8h, v30.8h // .....................................................................*..................|....................................................................*................. + // mls v14.8h, v10.8h, v7.h[0] // ..............................................................................*.........|.............................................................................*........ + // mul v15.8h, v11.8h, v29.8h // ...........................................................*............................|..........................................................*........................... + // sqrdmulh v11.8h, v11.8h, v30.8h // ..........................................................*.............................|.........................................................*............................ + // mls v15.8h, v11.8h, v7.h[0] // ....................................................................*...................|...................................................................*.................. + // str q12, [x0], #(16) // ....................................................................................*...|...................................................................................*.. + // str q13, [x0, #(-16 + 1*(512/8))] // ....................*...................................................................|...................*.................................................................. 
+ // str q14, [x0, #(-16 + 2*(512/8))] // ......................................................................................*.|.....................................................................................* + // str q15, [x0, #(-16 + 3*(512/8))] // ..........................................................................*.............|.........................................................................*............ + + sub count, count, #1 + cbnz count, layer123_start + sub v23.8H, v22.8H, v16.8H // *.................................................. + add v22.8H, v6.8H, v9.8H // .*................................................. + // gap // ................................................... + // gap // ................................................... + sqrdmulh v24.8H, v12.8H, v0.H[5] // ...*............................................... + mul v3.8H, v20.8H, v0.H[4] // ..........*........................................ + // gap // ................................................... + // gap // ................................................... + add v26.8H, v25.8H, v15.8H // ..*................................................ + sqrdmulh v20.8H, v27.8H, v0.H[1] // .....*............................................. + // gap // ................................................... + // gap // ................................................... + mul v28.8H, v27.8H, v0.H[0] // .......*........................................... + mls v4.8H, v19.8H, v7.H[0] // ....*.............................................. + // gap // ................................................... + // gap // ................................................... + mls v3.8H, v11.8H, v7.H[0] // ...............*................................... + mls v14.8H, v24.8H, v7.H[0] // ........*.......................................... + // gap // ................................................... + // gap // ................................................... 
+ add v19.8H, v22.8H, v26.8H // ......*............................................ + sqrdmulh v27.8H, v23.8H, v0.H[3] // .............*..................................... + // gap // ................................................... + // gap // ................................................... + mul v23.8H, v23.8H, v0.H[2] // ...........*....................................... + sub v22.8H, v22.8H, v26.8H // .........*......................................... + str q4, [x0, #64] // ............*...................................... + // gap // ................................................... + sub v24.8H, v19.8H, v13.8H // ...................*............................... + add v19.8H, v19.8H, v13.8H // ......................*............................ + // gap // ................................................... + // gap // ................................................... + mls v28.8H, v20.8H, v7.H[0] // ..............*.................................... + sqrdmulh v26.8H, v22.8H, v0.H[3] // .................*................................. + // gap // ................................................... + // gap // ................................................... + mls v23.8H, v27.8H, v7.H[0] // ................*.................................. + mul v22.8H, v22.8H, v0.H[2] // ....................*.............................. + // gap // ................................................... + // gap // ................................................... + sqrdmulh v27.8H, v24.8H, v0.H[1] // .......................*........................... + mul v24.8H, v24.8H, v0.H[0] // ........................*.......................... + // gap // ................................................... + // gap // ................................................... + mul v20.8H, v19.8H, v29.8H // ......................................*............ + sqrdmulh v19.8H, v19.8H, v30.8H // .......................................*........... 
+ str q28, [x0, #320] // ..................*................................ + // gap // ................................................... + sub v28.8H, v23.8H, v14.8H // ...........................*....................... + mls v22.8H, v26.8H, v7.H[0] // .....................*............................. + // gap // ................................................... + // gap // ................................................... + add v23.8H, v23.8H, v14.8H // .............................*..................... + mls v24.8H, v27.8H, v7.H[0] // ..............................*.................... + // gap // ................................................... + // gap // ................................................... + sqrdmulh v27.8H, v28.8H, v0.H[1] // ....................................*.............. + mul v28.8H, v28.8H, v0.H[0] // .....................................*............. + // gap // ................................................... + // gap // ................................................... + sub v26.8H, v22.8H, v3.8H // .........................*......................... + add v22.8H, v22.8H, v3.8H // ..........................*........................ + // gap // ................................................... + // gap // ................................................... + sqrdmulh v3.8H, v23.8H, v30.8H // ...............................*................... + mul v23.8H, v23.8H, v29.8H // ................................*.................. + str q24, [x0, #256] // ...................................*............... + // gap // ................................................... + mul v24.8H, v26.8H, v0.H[0] // ............................*...................... + mul v11.8H, v22.8H, v29.8H // .................................*................. + // gap // ................................................... + // gap // ................................................... 
+ mls v28.8H, v27.8H, v7.H[0] // ...........................................*....... + sqrdmulh v27.8H, v26.8H, v0.H[1] // ..................................*................ + // gap // ................................................... + // gap // ................................................... + sqrdmulh v22.8H, v22.8H, v30.8H // .........................................*......... + mls v23.8H, v3.8H, v7.H[0] // ........................................*.......... + // gap // ................................................... + // gap // ................................................... + mls v20.8H, v19.8H, v7.H[0] // ............................................*...... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + mls v24.8H, v27.8H, v7.H[0] // ..........................................*........ + str q28, [x0, #448] // ................................................*.. + // gap // ................................................... + // gap // ................................................... + mls v11.8H, v22.8H, v7.H[0] // ...............................................*... + str q23, [x0, #192] // .............................................*..... + // gap // ................................................... + // gap // ................................................... + str q20, [x0], #(16) // .................................................*. + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + str q24, [x0, #368] // ..............................................*.... + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... 
+ str q11, [x0, #112] // ..................................................* + // gap // ................................................... + // gap // ................................................... + // gap // ................................................... + + // original source code + // sub v3.8H, v22.8H, v16.8H // *.................................................. + // add v22.8H, v6.8H, v9.8H // .*................................................. + // add v18.8H, v25.8H, v15.8H // ....*.............................................. + // sqrdmulh v5.8H, v12.8H, v0.H[5] // ..*................................................ + // mls v4.8H, v19.8H, v7.H[0] // .......*........................................... + // sqrdmulh v19.8H, v27.8H, v0.H[1] // .....*............................................. + // add v16.8H, v22.8H, v18.8H // ..........*........................................ + // mul v23.8H, v27.8H, v0.H[0] // ......*............................................ + // mls v14.8H, v5.8H, v7.H[0] // .........*......................................... + // sub v27.8H, v22.8H, v18.8H // .............*..................................... + // mul v5.8H, v20.8H, v0.H[4] // ...*............................................... + // mul v18.8H, v3.8H, v0.H[2] // ............*...................................... + // str q4, [x0, #64] // ..............*.................................... + // sqrdmulh v3.8H, v3.8H, v0.H[3] // ...........*....................................... + // mls v23.8H, v19.8H, v7.H[0] // .................*................................. + // mls v5.8H, v11.8H, v7.H[0] // ........*.......................................... + // mls v18.8H, v3.8H, v7.H[0] // ...................*............................... + // sqrdmulh v20.8H, v27.8H, v0.H[3] // ..................*................................ + // str q23, [x0, #320] // .........................*......................... 
+ // sub v19.8H, v16.8H, v13.8H // ...............*................................... + // mul v3.8H, v27.8H, v0.H[2] // ....................*.............................. + // mls v3.8H, v20.8H, v7.H[0] // ...........................*....................... + // add v11.8H, v16.8H, v13.8H // ................*.................................. + // sqrdmulh v23.8H, v19.8H, v0.H[1] // .....................*............................. + // mul v4.8H, v19.8H, v0.H[0] // ......................*............................ + // sub v19.8H, v3.8H, v5.8H // ................................*.................. + // add v3.8H, v3.8H, v5.8H // .................................*................. + // sub v24.8H, v18.8H, v14.8H // ..........................*........................ + // mul v8.8H, v19.8H, v0.H[0] // .....................................*............. + // add v5.8H, v18.8H, v14.8H // ............................*...................... + // mls v4.8H, v23.8H, v7.H[0] // .............................*..................... + // sqrdmulh v17.8H, v5.8H, v30.8H // ..................................*................ + // mul v2.8H, v5.8H, v29.8H // ...................................*............... + // mul v14.8H, v3.8H, v29.8H // ......................................*............ + // sqrdmulh v5.8H, v19.8H, v0.H[1] // ........................................*.......... + // str q4, [x0, #256] // ....................................*.............. + // sqrdmulh v20.8H, v24.8H, v0.H[1] // ..............................*.................... + // mul v13.8H, v24.8H, v0.H[0] // ...............................*................... + // mul v24.8H, v11.8H, v29.8H // .......................*........................... + // sqrdmulh v19.8H, v11.8H, v30.8H // ........................*.......................... + // mls v2.8H, v17.8H, v7.H[0] // ..........................................*........ 
+ // sqrdmulh v3.8H, v3.8H, v30.8H // .........................................*......... + // mls v8.8H, v5.8H, v7.H[0] // ............................................*...... + // mls v13.8H, v20.8H, v7.H[0] // .......................................*........... + // mls v24.8H, v19.8H, v7.H[0] // ...........................................*....... + // str q2, [x0, #192] // ...............................................*... + // str q8, [x0, #384] // .................................................*. + // mls v14.8H, v3.8H, v7.H[0] // ..............................................*.... + // str q13, [x0, #448] // .............................................*..... + // str q24, [x0], #(16) // ................................................*.. + // str q14, [x0, #112] // ..................................................* + + + pop_stack + ret \ No newline at end of file diff --git a/examples/opt/aarch64/intt_kyber_123_45_67_twiddles.s b/examples/opt/aarch64/intt_kyber_123_45_67_twiddles.s new file mode 100644 index 00000000..b279f651 --- /dev/null +++ b/examples/opt/aarch64/intt_kyber_123_45_67_twiddles.s @@ -0,0 +1,493 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +roots_l56: +.short -910 +.short -910 +.short -1227 +.short -1227 +.short 219 +.short 219 +.short 855 +.short 855 +.short -8957 +.short -8957 +.short -12078 +.short -12078 +.short 2156 +.short 2156 +.short 8416 +.short 8416 +.short 1175 +.short 1175 +.short 394 +.short 394 +.short -1029 +.short -1029 +.short -1212 +.short -1212 +.short 11566 +.short 11566 +.short 3878 +.short 3878 +.short -10129 +.short -10129 +.short -11930 +.short -11930 +.short -885 +.short -885 +.short 1219 +.short 1219 +.short 1455 +.short 1455 +.short 1607 +.short 1607 +.short -8711 +.short -8711 +.short 11999 +.short 11999 +.short 14322 +.short 14322 +.short 15818 +.short 15818 +.short -648 +.short -648 +.short -1481 +.short -1481 +.short 712 +.short 712 +.short 682 +.short 682 +.short -6378 +.short -6378 +.short -14578 +.short -14578 +.short 7008 +.short 7008 +.short 6713 +.short 6713 +.short -886 +.short -886 +.short 1179 +.short 1179 +.short -1026 +.short -1026 +.short -1092 +.short -1092 +.short -8721 +.short -8721 +.short 11605 +.short 11605 +.short -10099 +.short -10099 +.short -10749 +.short -10749 +.short 554 +.short 554 +.short -1143 +.short -1143 +.short -403 +.short -403 +.short 525 +.short 525 +.short 5453 +.short 5453 +.short -11251 +.short -11251 +.short -3967 +.short -3967 +.short 5168 +.short 5168 +.short 927 +.short 927 +.short -1534 +.short -1534 +.short 461 +.short 461 +.short -1438 +.short -1438 +.short 9125 +.short 9125 +.short -15099 +.short -15099 +.short 4538 +.short 
4538 +.short -14155 +.short -14155 +.short 735 +.short 735 +.short -561 +.short -561 +.short -757 +.short -757 +.short -319 +.short -319 +.short 7235 +.short 7235 +.short -5522 +.short -5522 +.short -7451 +.short -7451 +.short -3140 +.short -3140 +.short 863 +.short 863 +.short 1230 +.short 1230 +.short 556 +.short 556 +.short -1063 +.short -1063 +.short 8495 +.short 8495 +.short 12107 +.short 12107 +.short 5473 +.short 5473 +.short -10463 +.short -10463 +.short -452 +.short -452 +.short -807 +.short -807 +.short -1435 +.short -1435 +.short 1010 +.short 1010 +.short -4449 +.short -4449 +.short -7943 +.short -7943 +.short -14125 +.short -14125 +.short 9942 +.short 9942 +.short -1645 +.short -1645 +.short 780 +.short 780 +.short 109 +.short 109 +.short 1031 +.short 1031 +.short -16192 +.short -16192 +.short 7678 +.short 7678 +.short 1073 +.short 1073 +.short 10148 +.short 10148 +.short 1239 +.short 1239 +.short -375 +.short -375 +.short 1292 +.short 1292 +.short -1584 +.short -1584 +.short 12196 +.short 12196 +.short -3691 +.short -3691 +.short 12717 +.short 12717 +.short -15592 +.short -15592 +.short 1414 +.short 1414 +.short -1320 +.short -1320 +.short -33 +.short -33 +.short 464 +.short 464 +.short 13918 +.short 13918 +.short -12993 +.short -12993 +.short -325 +.short -325 +.short 4567 +.short 4567 +.short -641 +.short -641 +.short 992 +.short 992 +.short 941 +.short 941 +.short 1021 +.short 1021 +.short -6309 +.short -6309 +.short 9764 +.short 9764 +.short 9262 +.short 9262 +.short 10050 +.short 10050 +.short -268 +.short -268 +.short -733 +.short -733 +.short 892 +.short 892 +.short -939 +.short -939 +.short -2638 +.short -2638 +.short -7215 +.short -7215 +.short 8780 +.short 8780 +.short -9243 +.short -9243 +.short -632 +.short -632 +.short 816 +.short 816 +.short 1352 +.short 1352 +.short -650 +.short -650 +.short -6221 +.short -6221 +.short 8032 +.short 8032 +.short 13308 +.short 13308 +.short -6398 +.short -6398 +.short 642 +.short 642 +.short -952 +.short 
-952 +.short 1540 +.short 1540 +.short -1651 +.short -1651 +.short 6319 +.short 6319 +.short -9371 +.short -9371 +.short 15159 +.short 15159 +.short -16251 +.short -16251 +.short -1461 +.short -1461 +.short 1482 +.short 1482 +.short 540 +.short 540 +.short 1626 +.short 1626 +.short -14381 +.short -14381 +.short 14588 +.short 14588 +.short 5315 +.short 5315 +.short 16005 +.short 16005 +.short 1274 +.short 1274 +.short 1052 +.short 1052 +.short 1025 +.short 1025 +.short -1197 +.short -1197 +.short 12540 +.short 12540 +.short 10355 +.short 10355 +.short 10089 +.short 10089 +.short -11782 +.short -11782 +.short 279 +.short 279 +.short 1173 +.short 1173 +.short -233 +.short -233 +.short 667 +.short 667 +.short 2746 +.short 2746 +.short 11546 +.short 11546 +.short -2293 +.short -2293 +.short 6565 +.short 6565 +.short 314 +.short 314 +.short -756 +.short -756 +.short 48 +.short 48 +.short -1409 +.short -1409 +.short 3091 +.short 3091 +.short -7441 +.short -7441 +.short 472 +.short 472 +.short -13869 +.short -13869 +.short 1573 +.short 1573 +.short 76 +.short 76 +.short -331 +.short -331 +.short -289 +.short -289 +.short 15483 +.short 15483 +.short 748 +.short 748 +.short -3258 +.short -3258 +.short -2845 +.short -2845 +.short -1100 +.short -1100 +.short -723 +.short -723 +.short 680 +.short 680 +.short 568 +.short 568 +.short -10828 +.short -10828 +.short -7117 +.short -7117 +.short 6693 +.short 6693 +.short 5591 +.short 5591 +.short 1041 +.short 1041 +.short -1637 +.short -1637 +.short -583 +.short -583 +.short -17 +.short -17 +.short 10247 +.short 10247 +.short -16113 +.short -16113 +.short -5739 +.short -5739 +.short -167 +.short -167 +roots_l34: +.short 1583 +.short 15582 +.short -821 +.short -8081 +.short 1355 +.short 13338 +.short 0 +.short 0 +.short -569 +.short -5601 +.short 450 +.short 4429 +.short 936 +.short 9213 +.short 0 +.short 0 +.short 69 +.short 679 +.short 447 +.short 4400 +.short -535 +.short -5266 +.short 0 +.short 0 +.short 543 +.short 5345 +.short 
1235 +.short 12156 +.short -1426 +.short -14036 +.short 0 +.short 0 +.short -797 +.short -7845 +.short -1333 +.short -13121 +.short 1089 +.short 10719 +.short 0 +.short 0 +.short -193 +.short -1900 +.short -56 +.short -551 +.short 283 +.short 2786 +.short 0 +.short 0 +.short 1410 +.short 13879 +.short -1476 +.short -14529 +.short -1339 +.short -13180 +.short 0 +.short 0 +.short -1062 +.short -10453 +.short 882 +.short 8682 +.short -296 +.short -2914 +.short 0 +.short 0 +roots_l012: +.short 1600 +.short 15749 +.short 40 +.short 394 +.short 749 +.short 7373 +.short -848 +.short -8347 +.short 1432 +.short 14095 +.short -630 +.short -6201 +.short 687 +.short 6762 +.short 0 +.short 0 \ No newline at end of file diff --git a/examples/opt/aarch64/ntt_dilithium_1234_5678_twiddles.s b/examples/opt/aarch64/ntt_dilithium_1234_5678_twiddles.s new file mode 100644 index 00000000..78afb4dc --- /dev/null +++ b/examples/opt/aarch64/ntt_dilithium_1234_5678_twiddles.s @@ -0,0 +1,541 @@ + +/// +/// Copyright (c) 2022 Arm Limited +/// Copyright (c) 2022 Hanno Becker +/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +/// + +roots_l0123: +.word -3572223 +.word -915382907 +.word 3765607 +.word 964937599 +.word 3761513 +.word 963888510 +.word -3201494 +.word -820383522 +.word -2883726 +.word -738955404 +.word -3145678 +.word -806080660 +.word -3201430 +.word -820367122 +.word -601683 +.word -154181397 +.word 3542485 +.word 907762539 +.word 2682288 +.word 687336873 +.word 2129892 +.word 545785280 +.word 3764867 +.word 964747974 +.word -1005239 +.word -257592709 +.word 557458 +.word 142848732 +.word -1221177 +.word -312926867 +.word 0 +.word 0 +roots_l45: +.word -3370349 +.word -863652652 +.word 3602218 +.word 923069133 +.word 3182878 +.word 815613168 +.word -4063053 +.word -1041158200 +.word 2740543 +.word 702264730 +.word -3586446 +.word -919027554 +.word 2663378 +.word 682491182 +.word -3110818 +.word -797147778 +.word 2101410 +.word 538486762 +.word -1674615 +.word -429120452 +.word 3704823 +.word 949361686 +.word 1159875 +.word 297218217 +.word -3524442 +.word -903139016 +.word 394148 +.word 101000509 +.word 928749 +.word 237992130 +.word -434125 +.word -111244624 +.word 1095468 +.word 280713909 +.word -3506380 +.word -898510625 +.word 676590 +.word 173376332 +.word 2071829 +.word 530906624 +.word -4018989 +.word -1029866791 +.word -1335936 +.word -342333886 +.word 3241972 +.word 830756018 +.word 2156050 +.word 552488273 +.word -3227876 +.word -827143915 +.word 3415069 +.word 875112161 +.word 1759347 +.word 450833045 +.word 1714295 +.word 439288460 +.word -817536 +.word -209493775 +.word -3574466 +.word -915957677 +.word 2453983 +.word 628833668 +.word 3756790 +.word 962678241 +.word -1935799 +.word -496048908 +.word 1460718 +.word 374309300 +.word -1716988 +.word -439978542 +.word -3950053 +.word 
-1012201926 +.word -642628 +.word -164673562 +.word -2897314 +.word -742437332 +.word 3192354 +.word 818041395 +.word -3585098 +.word -918682129 +.word 556856 +.word 142694469 +.word 3870317 +.word 991769559 +.word 2815639 +.word 721508096 +.word 2917338 +.word 747568486 +.word 1853806 +.word 475038184 +.word 2283733 +.word 585207070 +.word 3345963 +.word 857403734 +.word 1858416 +.word 476219497 +roots_l67: +.word 3073009 +.word 1277625 +.word -2635473 +.word 3852015 +.word 787459213 +.word 327391679 +.word -675340520 +.word 987079667 +.word 1753 +.word -2659525 +.word 2660408 +.word -59148 +.word 449207 +.word -681503850 +.word 681730119 +.word -15156688 +.word -1935420 +.word -1455890 +.word -1780227 +.word 2772600 +.word -495951789 +.word -373072124 +.word -456183549 +.word 710479343 +.word 4183372 +.word -3222807 +.word -3121440 +.word -274060 +.word 1071989969 +.word -825844983 +.word -799869667 +.word -70227934 +.word 1182243 +.word 636927 +.word -3956745 +.word -3284915 +.word 302950022 +.word 163212680 +.word -1013916752 +.word -841760171 +.word 87208 +.word -3965306 +.word -2296397 +.word -3716946 +.word 22347069 +.word -1016110510 +.word -588452222 +.word -952468207 +.word 2508980 +.word 2028118 +.word 1937570 +.word -3815725 +.word 642926661 +.word 519705671 +.word 496502727 +.word -977780347 +.word -27812 +.word 1009365 +.word -1979497 +.word -3956944 +.word -7126831 +.word 258649997 +.word -507246529 +.word -1013967746 +.word 822541 +.word -2454145 +.word 1596822 +.word -3759465 +.word 210776307 +.word -628875181 +.word 409185979 +.word -963363710 +.word 2811291 +.word -2983781 +.word -1109516 +.word 4158088 +.word 720393920 +.word -764594519 +.word -284313712 +.word 1065510939 +.word -1685153 +.word 2678278 +.word -3551006 +.word -250446 +.word -431820817 +.word 686309310 +.word -909946047 +.word -64176841 +.word -3410568 +.word -3768948 +.word 635956 +.word -2455377 +.word -873958779 +.word -965793731 +.word 162963861 +.word -629190881 +.word 
1528066 +.word 482649 +.word 1148858 +.word -2962264 +.word 391567239 +.word 123678909 +.word 294395108 +.word -759080783 +.word -4146264 +.word 2192938 +.word 2387513 +.word -268456 +.word -1062481036 +.word 561940831 +.word 611800717 +.word -68791907 +.word -1772588 +.word -1727088 +.word -3611750 +.word -3180456 +.word -454226054 +.word -442566669 +.word -925511710 +.word -814992530 +.word -565603 +.word 169688 +.word 2462444 +.word -3334383 +.word -144935890 +.word 43482586 +.word 631001801 +.word -854436357 +.word 3747250 +.word 1239911 +.word 3195676 +.word 1254190 +.word 960233614 +.word 317727459 +.word 818892658 +.word 321386456 +.word 2296099 +.word -3838479 +.word 2642980 +.word -12417 +.word 588375860 +.word -983611064 +.word 677264190 +.word -3181859 +.word -4166425 +.word -3488383 +.word 1987814 +.word -3197248 +.word -1067647297 +.word -893898890 +.word 509377762 +.word -819295484 +.word 2998219 +.word -89301 +.word -1354892 +.word -1310261 +.word 768294260 +.word -22883400 +.word -347191365 +.word -335754661 +.word 141835 +.word 2513018 +.word 613238 +.word -2218467 +.word 36345249 +.word 643961400 +.word 157142369 +.word -568482643 +.word 1736313 +.word 235407 +.word -3250154 +.word 3258457 +.word 444930577 +.word 60323094 +.word -832852657 +.word 834980303 +.word -458740 +.word 4040196 +.word 2039144 +.word -818761 +.word -117552223 +.word 1035301089 +.word 522531086 +.word -209807681 +.word -1921994 +.word -3472069 +.word -1879878 +.word -2178965 +.word -492511373 +.word -889718424 +.word -481719139 +.word -558360247 +.word -2579253 +.word 1787943 +.word -2391089 +.word -2254727 +.word -660934133 +.word 458160776 +.word -612717067 +.word -577774276 +.word -1623354 +.word -2374402 +.word 586241 +.word 527981 +.word -415984810 +.word -608441020 +.word 150224382 +.word 135295244 +.word 2105286 +.word -2033807 +.word -1179613 +.word -2743411 +.word 539479988 +.word -521163479 +.word -302276083 +.word -702999655 +.word 3482206 +.word -4182915 +.word 
-1300016 +.word -2362063 +.word 892316032 +.word -1071872863 +.word -333129378 +.word -605279149 +.word -1476985 +.word 2491325 +.word 507927 +.word -724804 +.word -378477722 +.word 638402564 +.word 130156402 +.word -185731180 +.word 1994046 +.word -1393159 +.word -1187885 +.word -1834526 +.word 510974714 +.word -356997292 +.word -304395785 +.word -470097680 +.word -1317678 +.word 2461387 +.word 3035980 +.word 621164 +.word -337655269 +.word 630730945 +.word 777970524 +.word 159173408 +.word -3033742 +.word 2647994 +.word -2612853 +.word 749577 +.word -777397036 +.word 678549029 +.word -669544140 +.word 192079267 +.word -338420 +.word 3009748 +.word 4148469 +.word -4022750 +.word -86720197 +.word 771248568 +.word 1063046068 +.word -1030830548 +.word 3901472 +.word -1226661 +.word 2925816 +.word 3374250 +.word 999753034 +.word -314332144 +.word 749740976 +.word 864652284 +.word 3980599 +.word -1615530 +.word 1665318 +.word 1163598 +.word 1020029345 +.word -413979908 +.word 426738094 +.word 298172236 +.word 2569011 +.word 1723229 +.word 2028038 +.word -3369273 +.word 658309618 +.word 441577800 +.word 519685171 +.word -863376927 +.word 1356448 +.word -2775755 +.word 2683270 +.word -2778788 +.word 347590090 +.word -711287812 +.word 687588511 +.word -712065019 +.word 3994671 +.word -1370517 +.word 3363542 +.word 545376 +.word 1023635298 +.word -351195274 +.word 861908357 +.word 139752717 +.word -11879 +.word 3020393 +.word 214880 +.word -770441 +.word -3043996 +.word 773976352 +.word 55063046 +.word -197425671 +.word -3467665 +.word 2312838 +.word -653275 +.word -459163 +.word -888589898 +.word 592665232 +.word -167401858 +.word -117660617 +.word 3105558 +.word 508145 +.word 860144 +.word 140244 +.word 795799901 +.word 130212265 +.word 220412084 +.word 35937555 +.word -1103344 +.word -553718 +.word 3430436 +.word -1514152 +.word -282732136 +.word -141890356 +.word 879049958 +.word -388001774 +.word 348812 +.word -327848 +.word 1011223 +.word -2354215 +.word 89383150 
+.word -84011120 +.word 259126110 +.word -603268097 +.word -2185084 +.word 2358373 +.word -3014420 +.word 2926054 +.word -559928242 +.word 604333585 +.word -772445769 +.word 749801963 +.word 3123762 +.word -2193087 +.word -1716814 +.word -392707 +.word 800464680 +.word -561979013 +.word -439933955 +.word -100631253 +.word -3818627 +.word -1922253 +.word -2236726 +.word 1744507 +.word -978523985 +.word -492577742 +.word -573161516 +.word 447030292 +.word -303005 +.word -3974485 +.word 1900052 +.word 1054478 +.word -77645096 +.word -1018462631 +.word 486888731 +.word 270210213 +.word 3531229 +.word -3773731 +.word -781875 +.word -731434 +.word 904878186 +.word -967019376 +.word -200355636 +.word -187430119 \ No newline at end of file diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py index 9fb54ecb..e5c79634 100644 --- a/slothy/targets/aarch64/aarch64_neon.py +++ b/slothy/targets/aarch64/aarch64_neon.py @@ -2903,6 +2903,15 @@ def make(cls, src): ] return obj + +class ASimdCompare(AArch64Instruction): + """Parent class for ASIMD compare instructions""" + +class cmge(ASimdCompare): # pylint: disable=missing-docstring,invalid-name + pattern = "cmge ., ., ." + inputs = ["Va", "Vb"] + outputs = ["Vd"] + # In a pair of vins writing both 64-bit lanes of a vector, mark the # target vector as output rather than input/output. This enables further # renaming opportunities. 
diff --git a/slothy/targets/aarch64/apple_m1_firestorm_experimental.py b/slothy/targets/aarch64/apple_m1_firestorm_experimental.py index 63e7656e..5c327d32 100644 --- a/slothy/targets/aarch64/apple_m1_firestorm_experimental.py +++ b/slothy/targets/aarch64/apple_m1_firestorm_experimental.py @@ -120,7 +120,7 @@ def get_min_max_objective(slothy): vqdmulh_lane, vmull, vmlal, vsrshr, vushr, vusra, vshl, - vand, vbic): ExecutionUnit.V(), + vand, vbic, ASimdCompare): ExecutionUnit.V(), (vadd, vsub, trn1, trn2): ExecutionUnit.V(), Vins: ExecutionUnit.V(), # guessed @@ -184,7 +184,7 @@ def get_min_max_objective(slothy): vqdmulh_lane, vmull, vmlal, vsrshr, vushr, vusra, vshl, - vand, vbic): 1, + vand, vbic, ASimdCompare): 1, (vadd, vsub, trn1, trn2): 1, @@ -239,7 +239,7 @@ def get_min_max_objective(slothy): vmull, vmlal, vsrshr, vusra): 3, (vshl, vushr, - vand, vbic): 2, + vand, vbic, ASimdCompare): 2, (vadd, vsub, trn1, trn2): 2, Vins: 2, # or something less than 13 diff --git a/slothy/targets/aarch64/apple_m1_icestorm_experimental.py b/slothy/targets/aarch64/apple_m1_icestorm_experimental.py index 6b020207..dd2fa4d5 100644 --- a/slothy/targets/aarch64/apple_m1_icestorm_experimental.py +++ b/slothy/targets/aarch64/apple_m1_icestorm_experimental.py @@ -98,7 +98,7 @@ def get_min_max_objective(slothy): vqdmulh_lane, vmull, vmlal, vsrshr, vushr, vusra, vshl, - vand, vbic): ExecutionUnit.V(), + vand, vbic, ASimdCompare): ExecutionUnit.V(), (vadd, vsub, trn1, trn2): ExecutionUnit.V(), @@ -154,7 +154,7 @@ def get_min_max_objective(slothy): vqdmulh_lane, vmull, vmlal, vsrshr, vushr, vusra, vshl, - vand, vbic): 1, + vand, vbic, ASimdCompare): 1, (vadd, vsub, trn1, trn2): 1, @@ -209,7 +209,7 @@ def get_min_max_objective(slothy): vmull, vmlal, vsrshr, vusra): 3, (vshl, vushr, - vand, vbic): 2, + vand, vbic, ASimdCompare): 2, (vadd, vsub, trn1, trn2): 2, Vins: 2, # 2 or <= 9 diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py index 
0aa6562f..8fee5c2a 100644 --- a/slothy/targets/aarch64/cortex_a55.py +++ b/slothy/targets/aarch64/cortex_a55.py @@ -118,6 +118,8 @@ def get_min_max_objective(slothy): St4 : [[ExecutionUnit.VEC0, ExecutionUnit.VEC1, ExecutionUnit.SCALAR_LOAD, ExecutionUnit.SCALAR_STORE] + ExecutionUnit.SCALAR()], + Ld4: [[ExecutionUnit.VEC0, ExecutionUnit.VEC1, ExecutionUnit.SCALAR_LOAD] + + ExecutionUnit.SCALAR()], # non-q-form vector instructions ( umov_d, mov_d01, mov_b00, @@ -137,6 +139,9 @@ def get_min_max_objective(slothy): is_qform_form_of(trn2) : [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_dform_form_of(trn2) : [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(cmge): [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], + is_dform_form_of(cmge): [ExecutionUnit.VEC0, ExecutionUnit.VEC1], + is_qform_form_of(vzip1) : [[ExecutionUnit.VEC0, ExecutionUnit.VEC1]], is_dform_form_of(vzip1) : [ExecutionUnit.VEC0, ExecutionUnit.VEC1], @@ -172,12 +177,13 @@ def get_min_max_objective(slothy): vqrdmulh, vqrdmulh_lane, vqdmulh_lane, vmull, vmlal, vsrshr, umov_d ) : 1, - (trn2, trn1) : 1, + (trn2, trn1, ASimdCompare): 1, ( Ldr_Q ) : 2, ( Str_Q ) : 1, ( tst_wform ) : 1, ( nop, Vins, Ldr_X, Str_X ) : 1, St4 : 5, + Ld4 : 9, (fcsel_dform) : 1, (VecToGprMov, Mov_xtov_d) : 1, (movk_imm, mov) : 1, @@ -209,13 +215,15 @@ def get_min_max_objective(slothy): is_qform_form_of([vadd, vsub]) : 3, is_dform_form_of([vadd, vsub]) : 2, - ( trn1, trn2) : 2, + (trn1, trn2, ASimdCompare): 2, ( vsrshr ) : 3, ( vmul, vmul_lane, vmls, vmls_lane, vqrdmulh, vqrdmulh_lane, vqdmulh_lane, vmull, vmlal) : 4, ( Ldr_Q, Str_Q ) : 4, St4 : 5, + # TODO: Add distinction between Q/D and B/H vs. 
D/S + Ld4 : 11, ( Str_X, Ldr_X ) : 4, ( Vins, umov_d ) : 2, ( tst_wform) : 1, diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py index 81e408ff..77bc9df2 100644 --- a/slothy/targets/aarch64/cortex_a72_frontend.py +++ b/slothy/targets/aarch64/cortex_a72_frontend.py @@ -116,7 +116,7 @@ def get_min_max_objective(slothy): : [ExecutionUnit.ASIMD0], (vadd, vsub, - trn1, trn2 ) + trn1, trn2, ASimdCompare ) : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], Vins : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], @@ -147,7 +147,7 @@ def get_min_max_objective(slothy): : 2, (vadd, vsub, - trn1, trn2) + trn1, trn2, ASimdCompare) : 1, Vins : 1, @@ -176,7 +176,7 @@ def get_min_max_objective(slothy): : 5, (vadd, vsub, - trn1, trn2 ) + trn1, trn2, ASimdCompare ) : 3, # Approximation -- not necessary to get it exactly right, as mentioned above ( Ldr_Q, Ldr_X,